<h3>Task 1 </h3>

In [None]:
from sklearn.datasets import fetch_20newsgroups
# Load every document of the 20 Newsgroups corpus (subset='all'); the fixed
# seed makes the shuffled document order repeatable between runs.
newsgroups_data = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    random_state=42,
)

<h3> Task 2 </h3>

In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

# Load the corpus and hold out 20% of the documents for evaluation. Both the
# shuffle and the split are seeded, so the partition is the same on every run.
newsgroups_data = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups_data.data,
    newsgroups_data.target,
    test_size=0.2,
    random_state=42,
)

# --- Naive Bayes on raw token counts ---------------------------------------
nb_classifier = MultinomialNB()
nb_pipeline = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('classifier', nb_classifier),
])
# Pipeline.fit returns the pipeline itself, so fit and predict can be chained.
nb_predictions = nb_pipeline.fit(X_train, y_train).predict(X_test)

# --- Support vector machine on TF-IDF features -----------------------------
svm_classifier = SVC()
svm_pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', svm_classifier),
])
svm_predictions = svm_pipeline.fit(X_train, y_train).predict(X_test)

# --- Random forest on TF-IDF features --------------------------------------
# NOTE: the forest is created without a random_state, so its scores can
# differ slightly from one run to the next.
rf_classifier = RandomForestClassifier()
rf_pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer()),
    ('classifier', rf_classifier),
])
rf_predictions = rf_pipeline.fit(X_train, y_train).predict(X_test)

def evaluate_classifier(predictions, classifier_name):
    """Print the accuracy and per-class report for one set of predictions.

    The predictions are scored against the module-level ``y_test`` labels,
    and the report rows are labelled with ``newsgroups_data.target_names``.

    Args:
        predictions: Predicted labels for the held-out test documents.
        classifier_name: Name shown in the printed headings.
    """
    score = accuracy_score(y_test, predictions)
    per_class_report = classification_report(
        y_test,
        predictions,
        target_names=newsgroups_data.target_names,
    )

    print(f"Accuracy of {classifier_name}: {score:.2f}")
    print(f"Classification Report for {classifier_name}:\n{per_class_report}\n")

# Report each model in the order it was trained.
for _name, _predictions in (
    ("Naïve Bayes", nb_predictions),
    ("SVM", svm_predictions),
    ("Random Forest", rf_predictions),
):
    evaluate_classifier(_predictions, _name)


<h3> Task 3</h3>

In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from concurrent.futures import ThreadPoolExecutor

# Same corpus and seeded 80/20 split as the previous task.
newsgroups_data = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    random_state=42,
)

X_train, X_test, y_train, y_test = train_test_split(
    newsgroups_data.data,
    newsgroups_data.target,
    test_size=0.2,
    random_state=42,
)

# Feature extraction strategies to compare, keyed by the name printed in the
# results. The TF variant is a TfidfVectorizer with IDF weighting turned off.
feature_methods = {
    'Counts': CountVectorizer(),
    'Term Frequency (TF)': TfidfVectorizer(use_idf=False),
    'Term Frequency-Inverse Document Frequency (TF-IDF)': TfidfVectorizer(use_idf=True),
}

# Classifiers to pair with every feature extraction strategy, all created
# with their default hyper-parameters.
classifiers = {
    'Naïve Bayes': MultinomialNB(),
    'SVM': SVC(),
    'Random Forest': RandomForestClassifier(),
}

def train_and_evaluate_classifier(classifier_name, feature_name, classifier, feature_extractor):
    """Fit one vectorizer/classifier combination and print its test metrics.

    The estimators passed in are treated as unfitted templates: every call
    trains private clones. A Pipeline fits its steps in place, so without the
    clones several concurrently running calls that receive the same
    ``classifier`` or ``feature_extractor`` object would overwrite each
    other's fitted state and report metrics for the wrong model.

    Reads the module-level ``X_train``, ``X_test``, ``y_train``, ``y_test``
    and ``newsgroups_data``.

    Args:
        classifier_name: Display name of the classifier, used in the output.
        feature_name: Display name of the feature extraction method.
        classifier: Estimator template for the final pipeline step.
        feature_extractor: Vectorizer template for the first pipeline step.
    """
    # Imported here so the function carries its own dependency.
    from sklearn.base import clone

    pipeline = Pipeline([
        ('vectorizer', clone(feature_extractor)),
        ('classifier', clone(classifier)),
    ])

    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(y_test, predictions, target_names=newsgroups_data.target_names)

    # Emit the whole summary through a single print call so that results
    # from different worker threads are less likely to interleave. The text
    # produced is identical to three consecutive prints.
    print(
        f"Using {feature_name} with {classifier_name}:\n"
        f"Accuracy of {classifier_name}: {accuracy:.2f}\n"
        f"Classification Report for {classifier_name}:\n{report}\n"
    )

# Run every (feature method, classifier) combination on a thread pool.
# The futures are kept so that failures can be reported afterwards.
futures = []
with ThreadPoolExecutor(max_workers=8) as executor:
    for feature_name, feature_extractor in feature_methods.items():
        for classifier_name, classifier in classifiers.items():
            futures.append(
                executor.submit(
                    train_and_evaluate_classifier,
                    classifier_name,
                    feature_name,
                    classifier,
                    feature_extractor,
                )
            )

# Leaving the ``with`` block waits for all workers to finish. An exception
# raised inside a worker is stored on its future and would otherwise be
# silently discarded; calling result() re-raises it here.
for future in futures:
    future.result()


<h3> Task 4</h3>

In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

# Same corpus and seeded 80/20 split as the earlier tasks.
newsgroups_data = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    random_state=42,
)

X_train, X_test, y_train, y_test = train_test_split(
    newsgroups_data.data,
    newsgroups_data.target,
    test_size=0.2,
    random_state=42,
)

# NOTE: created without a random_state, so scores can vary between runs.
rf_classifier = RandomForestClassifier()

# CountVectorizer configurations to compare. Each entry supplies the five
# keyword arguments read by fit_classifier_with_params.
param_grid = [
    # Lowercased word unigrams, unrestricted vocabulary.
    {
        'lowercase': True,
        'stop_words': None,
        'analyzer': 'word',
        'ngram_range': (1, 1),
        'max_features': None,
    },
    # Case-preserving word uni-/bigrams, English stop words removed,
    # vocabulary capped at 1000 features.
    {
        'lowercase': False,
        'stop_words': 'english',
        'analyzer': 'word',
        'ngram_range': (1, 2),
        'max_features': 1000,
    },
    # Lowercased character uni-/bigrams, vocabulary capped at 5000 features.
    {
        'lowercase': True,
        'stop_words': None,
        'analyzer': 'char',
        'ngram_range': (1, 2),
        'max_features': 5000,
    },
]

def fit_classifier_with_params(params):
    """Train a CountVectorizer + random forest pipeline for one configuration.

    The module-level ``rf_classifier`` serves as an unfitted template: each
    call trains its own clone. A Pipeline fits its steps in place, so reusing
    the single shared forest would make every later call refit the classifier
    held by pipelines returned earlier, silently invalidating them.

    Reads the module-level ``X_train`` and ``y_train``.

    Args:
        params: Mapping with the keys ``lowercase``, ``stop_words``,
            ``analyzer``, ``ngram_range`` and ``max_features``, passed to
            ``CountVectorizer`` as keyword arguments.

    Returns:
        The fitted ``Pipeline``.

    Raises:
        KeyError: If ``params`` lacks one of the expected keys.
    """
    # Imported here so the function carries its own dependency.
    from sklearn.base import clone

    vectorizer = CountVectorizer(
        lowercase=params['lowercase'],
        stop_words=params['stop_words'],
        analyzer=params['analyzer'],
        ngram_range=params['ngram_range'],
        max_features=params['max_features'],
    )

    pipeline = Pipeline([
        ('vectorizer', vectorizer),
        ('classifier', clone(rf_classifier)),
    ])

    pipeline.fit(X_train, y_train)

    return pipeline

# Train and score one pipeline per CountVectorizer configuration.
for params in param_grid:
    pipeline = fit_classifier_with_params(params)
    predictions = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    report = classification_report(
        y_test,
        predictions,
        target_names=newsgroups_data.target_names,
    )

    print("CountVectorizer Parameters:", params)
    print(f"Random Forest Accuracy: {accuracy:.2f}")
    print("Classification Report:\n", report)
    print()
