<a href="https://colab.research.google.com/github/MohammedHamood/20NewsGroup/blob/main/20NewsGroup_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 20 News Group - Baselines

## Data Pre-Processing

In [None]:
import pandas as pd
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import preprocessingNLP as PNLP
import numpy as np
import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA, NMF, TruncatedSVD
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer,TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import Normalizer

# Import Dataset
# Fetch the 20 Newsgroups train/test splits through scikit-learn, with headers,
# footers and quoted text removed from every post.
# NOTE(review): this cell used to wget and untar the IMDB sentiment archive
# (aclImdb_v1.tar.gz) as well. Nothing visible in this notebook reads those
# files, so the ~84 MB download was dropped. Restore it if another cell needs it.
print("Fetching Dataset ...")
Newsgroup_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42, remove=['headers', 'footers', 'quotes'])
Newsgroup_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42, remove=['headers', 'footers', 'quotes'])
print("Dataset Fetched")

# Preprocessing
# Both splits go through the project's custom text-cleaning helper.
# NOTE(review): the exact cleaning steps live in preprocessingNLP -- confirm there.
print("PREPROCESSING ...")
Newsgroup_train.data = PNLP.customNLP(Newsgroup_train.data)
Newsgroup_test.data = PNLP.customNLP(Newsgroup_test.data)
# Only the training split is filtered here; data and target are filtered together.
# Presumably this drops documents left empty after cleaning -- verify in preprocessingNLP.
Newsgroup_train.data, Newsgroup_train.target = PNLP.removeEmptyInstances(Newsgroup_train.data, Newsgroup_train.target)
print("PREPROCESSING DONE!")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
Fetching Dataset ...
2020-03-11 05:52:03 URL:http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz [84125825/84125825] -> "aclImdb_v1.tar.gz" [1]


Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


Dataset Fetched
PREPROCESSING ...


  ' Beautiful Soup.' % markup)


PREPROCESSING DONE!


## Baseline for All Models
Runs cross-validation for every model with its default parameters to obtain baseline accuracies.

### Logistic Regression
Default Parameters: <br />
penalty: l2 <br />
tol: 1e-4 <br />
C: 1 <br />
solver: lbfgs (only handles l2 or no penalty) <br />
max_iter: 100 <br />
n_jobs: None


In [None]:
# Create a pipeline: token counts -> TF-IDF weighting -> row normalisation -> classifier
pip = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),
                ('Norm', Normalizer(copy=False)),('clf', LogisticRegression())])


# Time the 10-fold cross-validation of the whole pipeline
start_time = time.time()
scores = cross_val_score(pip, Newsgroup_train.data, Newsgroup_train.target, cv=10)
print("Runtime: %s seconds" % (time.time() - start_time))
# cross_val_score yields one held-out score per fold, so these are validation
# scores rather than training accuracy.
print("CV_Acc per fold: %s" % (scores,))
# Keep the mean as the final expression so the notebook displays it
# (previously it was computed mid-cell and silently discarded).
np.mean(scores)

Runtime: 318.0961935520172 seconds


0.7569608735213831

### SVM
Default params: <br />
penalty: l2 <br />
loss: squared_hinge <br />
tol: 1e-4 <br />
C: 1 <br />
multi_class: 'ovr' <br />
max_iter: 1000

In [None]:

# Baseline pipeline: token counts -> TF-IDF -> row normalisation -> linear SVM
pip = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('Norm', Normalizer(copy=False)),
    ('clf', LinearSVC()),
])

# Wall-clock the 10-fold cross-validation
start_time = time.time()
scores = cross_val_score(
    pip,
    Newsgroup_train.data,
    Newsgroup_train.target,
    cv=10,
)
print("Runtime: %s seconds" % (time.time() - start_time))
# Mean score over the folds, shown as the cell's output
scores.mean()

Runtime: 22.728375911712646 seconds


0.7778889899909008

### Decision Trees
criterion: 'gini' <br />
splitter: 'best' <br />
max_depth: None <br />
min_samples_split: 2 <br />
min_samples_leaf: 1 <br />
max_features: None <br />
max_leaf_nodes: None <br />
ccp_alpha: 0

In [None]:

# Baseline pipeline: token counts -> TF-IDF -> row normalisation -> decision tree
pip = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('Norm', Normalizer(copy=False)),
    ('clf', DecisionTreeClassifier()),
])

# Wall-clock the 10-fold cross-validation
start_time = time.time()
scores = cross_val_score(
    pip,
    Newsgroup_train.data,
    Newsgroup_train.target,
    cv=10,
)
print("Runtime: %s seconds" % (time.time() - start_time))
# Mean score over the folds, shown as the cell's output
scores.mean()

Runtime: 152.30219268798828 seconds


0.4871701546860782

### Random Forest
n_estimators: 100 <br />
criterion: 'gini' <br />
max_depth: None <br />
min_samples_split: 2 <br />
min_samples_leaf: 1 <br />
max_features: 'auto' <br />
max_leaf_nodes: None <br />
n_jobs: None <br />
ccp_alpha: 0.0 <br />
max_samples: None <br />


In [None]:

# Baseline pipeline: token counts -> TF-IDF -> row normalisation -> random forest
pip = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('Norm', Normalizer(copy=False)),
    ('clf', RandomForestClassifier()),
])

# Wall-clock the cross-validation; note this cell uses 3 folds
start_time = time.time()
scores = cross_val_score(
    pip,
    Newsgroup_train.data,
    Newsgroup_train.target,
    cv=3,
)
print("Runtime: %s seconds" % (time.time() - start_time))
# Mean score over the folds, shown as the cell's output
scores.mean()

Runtime: 86.77589225769043 seconds


0.6577802743195756

### RF with Adaboost

In [None]:

# Boosted random forest: token counts -> TF-IDF -> AdaBoost over a random forest.
# This pipeline has no separate 'Norm' step.
pip = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', AdaBoostClassifier(RandomForestClassifier(n_jobs=-1))),
])

# Wall-clock the cross-validation; note this cell uses 3 folds
start_time = time.time()
scores = cross_val_score(
    pip,
    Newsgroup_train.data,
    Newsgroup_train.target,
    cv=3,
)
print("Runtime: %s seconds" % (time.time() - start_time))
# Mean score over the folds, shown as the cell's output
scores.mean()

Runtime: 1779.6300780773163 seconds


0.5797098954850046

### DT with Adaboost

In [None]:

# Create a pipeline: token counts -> TF-IDF -> row normalisation -> AdaBoost over a decision tree.
# The tree is now wrapped in AdaBoostClassifier; previously this cell fitted a bare
# DecisionTreeClassifier and so only repeated the plain Decision Tree baseline.
# NOTE(review): the base tree is left at its defaults (no depth limit), following the
# AdaBoostClassifier(<model>()) pattern of the other boosted cells -- confirm that is intended.
pip = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),
                ('Norm', Normalizer(copy=False)),('clf', AdaBoostClassifier(DecisionTreeClassifier()))])


# Time the 10-fold cross-validation of the whole pipeline
start_time = time.time()
scores = cross_val_score(pip, Newsgroup_train.data, Newsgroup_train.target, cv=10)
print("Runtime: %s seconds" % (time.time() - start_time))
# Mean score over the folds, shown as the cell's output
np.mean(scores)

Runtime: 140.89277458190918 seconds


0.4878070973612375

### SVM with Adaboost

In [None]:

# Boosted linear SVM: token counts -> TF-IDF -> row normalisation -> AdaBoost (SAMME) over LinearSVC
pip = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('Norm', Normalizer(copy=False)),
    ('clf', AdaBoostClassifier(LinearSVC(), algorithm='SAMME')),
])

# Wall-clock the 10-fold cross-validation
start_time = time.time()
scores = cross_val_score(
    pip,
    Newsgroup_train.data,
    Newsgroup_train.target,
    cv=10,
)
print("Runtime: %s seconds" % (time.time() - start_time))
# Mean score over the folds, shown as the cell's output
scores.mean()

Runtime: 427.1325800418854 seconds


0.5448589626933575

### LR with Adaboost

In [None]:
# Boosted logistic regression: token counts -> TF-IDF -> row normalisation -> AdaBoost over LogisticRegression
pip = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('Norm', Normalizer(copy=False)),
    ('clf', AdaBoostClassifier(LogisticRegression())),
])

# Wall-clock the 10-fold cross-validation
start_time = time.time()
scores = cross_val_score(
    pip,
    Newsgroup_train.data,
    Newsgroup_train.target,
    cv=10,
)
print("Runtime: %s seconds" % (time.time() - start_time))
# Mean score over the folds, shown as the cell's output
scores.mean()

Runtime: 927.720205783844 seconds


0.4714285714285714