In [4]:
# !pip install scikit-learn

# Hyper Parameter Optimisation Implementation

In [20]:
# Import necessary libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the Dataset
newsgroups = fetch_20newsgroups(subset='all')

# Preprocess the Text (Add any preprocessing required)

# Step 2: Feature Extraction (TF-IDF Vectorization)
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5)
X = vectorizer.fit_transform(newsgroups.data)
y = newsgroups.target

# Step 3: Split the Data (Training + Validation, and Test Set)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Split Training + Validation into Training and Validation
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.125, random_state=42)

# Step 5: Set up SGDClassifier and GridSearchCV for Hyperparameter Tuning
# Define parameter grid for hyperparameter tuning
param_grid = {
    'loss': ['hinge', 'log_loss'],               # hinge is SVM, log_loss is logistic regression
    'penalty': ['l2', 'l1'],#, 'elasticnet'],  # Regularization types
    'alpha': [1e-4],#, 1e-3, 1e-2],           # Regularization strength
    'max_iter': [100],#, 200, 300],         # Number of epochs
    'tol': [None]#[1e-3, 1e-4, None]               # Stopping criterion
}

# Initialize SGDClassifier
clf = SGDClassifier(random_state=42)

# Step 6: Perform Grid Search with Cross-Validation to Find the Best Hyperparameters
grid_search = GridSearchCV(clf, param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Step 7: Retrieve Best Hyperparameters
print(f"Best Parameters: {grid_search.best_params_}")

# Step 8: Evaluate the Best Model on Validation Set
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)
print("\nValidation Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nValidation Classification Report:\n", classification_report(y_val, y_val_pred))

# Step 9: Train the Best Model on Full Training Data (Training + Validation)
best_model.fit(X_train_val, y_train_val)

# Step 10: Evaluate on the Test Set
y_test_pred = best_model.predict(X_test)
print("\nTest Accuracy:", accuracy_score(y_test, y_test_pred))
print("\nTest Classification Report:\n", classification_report(y_test, y_test_pred))

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best Parameters: {'alpha': 0.0001, 'loss': 'hinge', 'max_iter': 100, 'penalty': 'l2', 'tol': None}

Validation Accuracy: 0.919893899204244

Validation Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.91      0.92        92
           1       0.89      0.86      0.88        86
           2       0.76      0.91      0.83        77
           3       0.86      0.84      0.85        94
           4       0.94      0.92      0.93        85
           5       0.91      0.89      0.90       122
           6       0.91      0.90      0.91       102
           7       0.95      0.90      0.92        99
           8       0.92      0.97      0.94       104
           9       0.98      0.97      0.97        87
          10       0.96      1.00      0.98        90
          11       0.99      0.98      0.98        87
          12       0.88      0.87      0.87       105
        

# Basic Implementation

In [16]:
# from sklearn.datasets import fetch_20newsgroups
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.linear_model import SGDClassifier  # SVM with Stochastic Gradient Descent
# from sklearn.metrics import classification_report, accuracy_score

# newsgroups = fetch_20newsgroups(subset='all')

# vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5)
# dataX = vectorizer.fit_transform(newsgroups.data)
# dataY = newsgroups.target

# train_ratio = 0.75
# validation_ratio = 0.15
# test_ratio = 0.10

# # train is now 75% of the entire data set
# x_train, x_test, y_train, y_test = train_test_split(dataX, dataY, test_size=1 - train_ratio)

# # test is now 10% of the initial data set
# # validation is now 15% of the initial data set
# x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio)) 

# # print(x_test, x_val, x_test)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 203673 stored elements and shape (1885, 173446)>
  Coords	Values
  (0, 117326)	0.03520736385526365
  (0, 127278)	0.03414378326507175
  (0, 86192)	0.034998249575808695
  (0, 142775)	0.1003998181797237
  (0, 158383)	0.03673927189303268
  (0, 153225)	0.09653744214061885
  (0, 51358)	0.10881440012306128
  (0, 36586)	0.10553853795199583
  (0, 80473)	0.04963035075732454
  (0, 126248)	0.09488883531832912
  (0, 63950)	0.056199618813652735
  (0, 164631)	0.09846754125958262
  (0, 100319)	0.0916930094910735
  (0, 103015)	0.08554100023639419
  (0, 46845)	0.11152760387975934
  (0, 133449)	0.05599426571422951
  (0, 126091)	0.07033178984481478
  (0, 129553)	0.10389854340858318
  (0, 65132)	0.08475881961670258
  (0, 131486)	0.11623587611176993
  (0, 135415)	0.08092547129365302
  (0, 124529)	0.12420409404274306
  (0, 124523)	0.09088308373927181
  (0, 4077)	0.06843927845756899
  (0, 140581)	0.11330232967450017
  :	:
  (1884, 69190)	0.07700209

In [5]:
# Import necessary libraries
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier  # SVM with Stochastic Gradient Descent
from sklearn.metrics import classification_report, accuracy_score

# Step 1: Load the Dataset
newsgroups = fetch_20newsgroups(subset='all')

# Step 2: Preprocess the Text (Basic preprocessing)
# For demonstration, we'll use raw data; further preprocessing can be applied if needed

# Step 3: Feature Extraction
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5)
x = vectorizer.fit_transform(newsgroups.data)
y = newsgroups.target

# Step 4: Split Data into Training and Testing Sets
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42)

# Step 5: Train a Classification Model
clf = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                    random_state=42, max_iter=5, tol=None)
clf.fit(x_train, y_train)

# Step 6: Evaluate the Model
y_pred = clf.predict(x_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8819628647214854

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.87      0.84       151
           1       0.88      0.81      0.84       202
           2       0.78      0.88      0.82       195
           3       0.73      0.73      0.73       183
           4       0.90      0.87      0.89       205
           5       0.91      0.86      0.89       215
           6       0.84      0.81      0.83       193
           7       0.89      0.95      0.92       196
           8       0.90      0.94      0.92       168
           9       0.97      0.93      0.95       211
          10       0.87      0.99      0.93       198
          11       0.93      0.98      0.95       201
          12       0.95      0.74      0.83       202
          13       0.95      0.95      0.95       194
          14       0.87      0.99      0.92       189
          15       0.81      0.98      0.88       202
          16       0.89    

In [12]:
# Step 7: Classify New Text
def classify_new_text(text, vectorizer, clf):
    # Preprocess and vectorize the input text using the same TfidfVectorizer
    text_vectorized = vectorizer.transform([text])
    
    # Predict the category of the input text
    predicted_category = clf.predict(text_vectorized)
    
    # Get the actual category name (since target names are stored in newsgroups.target_names)
    category_name = newsgroups.target_names[predicted_category[0]]
    
    return category_name

# Example of classifying new text
new_text = "Steve went and shot up the school because of Trump"#"Apple just released a new iPhone with improved camera technology."
predicted_category = classify_new_text(new_text, vectorizer, clf)
print(f"The new text was classified as: {predicted_category}")

The new text was classified as: sci.med


# Alternate ChatGPT implementation

In [None]:
# # Install the required libraries
# !pip install scikit-learn numpy pandas nltk

In [None]:
# from sklearn.datasets import fetch_20newsgroups
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import TfidfVectorizer

# # Load the 20 newsgroups dataset
# categories = ['alt.atheism', 'comp.graphics', 'sci.med', 'rec.sport.baseball']
# newsgroups_data = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

# # Split the data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(newsgroups_data.data, newsgroups_data.target, test_size=0.2, random_state=42)

# # Initialize a TF-IDF Vectorizer to convert text to numerical vectors
# tfidf = TfidfVectorizer(stop_words='english', max_df=0.7)
# X_train_tfidf = tfidf.fit_transform(X_train)
# X_test_tfidf = tfidf.transform(X_test)

In [None]:
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import accuracy_score, classification_report

# # Train a Naive Bayes classifier
# nb_classifier = MultinomialNB()
# nb_classifier.fit(X_train_tfidf, y_train)

# # Predict on the test set
# y_pred = nb_classifier.predict(X_test_tfidf)

# # Evaluate the classifier
# accuracy = accuracy_score(y_test, y_pred)
# print(f"Naive Bayes Accuracy: {accuracy * 100:.2f}%")
# print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=newsgroups_data.target_names))