In [231]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fullc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [232]:
train_data = pd.read_csv('NewsgroupTopic.csv')
train_data['text']

0                            Moto 56001 DSP          ...
1                    Does anyone know of an ftp site ...
2                 sci-fi  and  fantasy artwork by man...
3                 you can come get  em  in  person.  ...
4                --hotels are selected from major hot...
                             ...                        
309     window size is "  max  it still scrolls aroun...
310     with Excell 4.0 and then exporting the data a...
311     with it.  There are  actually  3 versions:  a...
312     WSM is a compilation of submissions from shar...
313     Yes  both dblspaced and non-dblspaced drives ...
Name: text, Length: 314, dtype: object

In [233]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = tokenizer.tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return " ".join(filtered_text)

In [234]:
train_data = pd.read_csv('NewsgroupTopic.csv')
train_data['text_remove_stopwords'] = train_data['text'].apply(remove_stopwords)

In [235]:
train_data['text_remove_stopwords']

0      Moto 56001 DSP Megapixel perfect dimming shaki...
1      Does anyone know ftp site I get pkunzip2 04g I...
2      sci fi fantasy artwork many masters All mint e...
3      come get em person All GREAT condition These g...
4      hotels selected major hotel chains family reso...
                             ...                        
309    window size max still scrolls around 2 3 windo...
310    Excell 4 0 exporting data comma seperated vari...
311    There actually 3 versions plain dos version 38...
312    WSM compilation submissions shareware freeware...
313    Yes dblspaced non dblspaced drives defragmente...
Name: text_remove_stopwords, Length: 314, dtype: object

In [236]:
from nltk.stem import WordNetLemmatizer

In [237]:
def lemmas(text):
    lemmatiser = WordNetLemmatizer()
    word_tokens = word_tokenize(text)
    lemmas = [lemmatiser.lemmatize(word) for word in word_tokens]
    return " ".join(lemmas)

In [238]:
train_data['text_lemmatised'] = train_data['text_remove_stopwords'].apply(lemmas)

In [239]:
train_data['text_lemmatised']

0      Moto 56001 DSP Megapixel perfect dimming shaki...
1      Does anyone know ftp site I get pkunzip2 04g I...
2      sci fi fantasy artwork many master All mint ex...
3      come get em person All GREAT condition These g...
4      hotel selected major hotel chain family resort...
                             ...                        
309    window size max still scroll around 2 3 window...
310    Excell 4 0 exporting data comma seperated vari...
311    There actually 3 version plain do version 386 ...
312    WSM compilation submission shareware freeware ...
313    Yes dblspaced non dblspaced drive defragmented...
Name: text_lemmatised, Length: 314, dtype: object

In [240]:
train_data['text_lowercased']  = train_data['text_lemmatised'].str.lower()

In [241]:
train_data['text_lowercased']

0      moto 56001 dsp megapixel perfect dimming shaki...
1      does anyone know ftp site i get pkunzip2 04g i...
2      sci fi fantasy artwork many master all mint ex...
3      come get em person all great condition these g...
4      hotel selected major hotel chain family resort...
                             ...                        
309    window size max still scroll around 2 3 window...
310    excell 4 0 exporting data comma seperated vari...
311    there actually 3 version plain do version 386 ...
312    wsm compilation submission shareware freeware ...
313    yes dblspaced non dblspaced drive defragmented...
Name: text_lowercased, Length: 314, dtype: object

In [242]:
punctuations="?:!.,;`/:"
for word in train_data['text_lowercased']:
        if word in train_data['text_lowercased']:
            train_data['text_lowercased'].remove(word)

In [243]:
train_data['text_lowercased']

0      moto 56001 dsp megapixel perfect dimming shaki...
1      does anyone know ftp site i get pkunzip2 04g i...
2      sci fi fantasy artwork many master all mint ex...
3      come get em person all great condition these g...
4      hotel selected major hotel chain family resort...
                             ...                        
309    window size max still scroll around 2 3 window...
310    excell 4 0 exporting data comma seperated vari...
311    there actually 3 version plain do version 386 ...
312    wsm compilation submission shareware freeware ...
313    yes dblspaced non dblspaced drive defragmented...
Name: text_lowercased, Length: 314, dtype: object

In [386]:
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectPercentile, chi2,SelectFpr

In [391]:
# create tfidf matrix for training and validation datasets
tfidfconverter = TfidfVectorizer()
train_data_tfidf = tfidfconverter.fit_transform(train_data['text_lowercased'])

train_data_tfidf_new = SelectKBest(chi2, k=920).fit_transform(train_data_tfidf, train_data['Class'])

X_train, X_test, y_train, y_test = train_test_split(train_data_tfidf_new, train_data['Class'], test_size=0.2, random_state=0)


In [392]:
print("X_train", X_train.shape)
print("X_test", X_test.shape)
print("y_train", y_train.shape)
print("y_test", y_test.shape)

X_train (251, 920)
X_test (63, 920)
y_train (251,)
y_test (63,)


In [393]:
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier()

In [394]:
clf.fit(X_train, y_train) 

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [395]:
from sklearn.pipeline import Pipeline

In [396]:
text_clf = Pipeline([
            ('clf', SGDClassifier()),
            ])

In [397]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('clf',
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000, n_iter_no_change=5, n_jobs=None,
                               penalty='l2', power_t=0.5, random_state=None,
                               shuffle=True, tol=0.001, validation_fraction=0.1,
                               verbose=0, warm_start=False))],
         verbose=False)

In [398]:
y_pred = text_clf.predict(X_test)


print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))


[[14  0  1  0]
 [ 1 20  0  1]
 [ 0  3  9  0]
 [ 0  2  2 10]]
              precision    recall  f1-score   support

     atheism       0.93      0.93      0.93        15
     forsale       0.80      0.91      0.85        22
    religion       0.75      0.75      0.75        12
     windows       0.91      0.71      0.80        14

    accuracy                           0.84        63
   macro avg       0.85      0.83      0.83        63
weighted avg       0.85      0.84      0.84        63

0.8412698412698413


# Naive Bayes

In [399]:
clf = MultinomialNB().fit(X_train, y_train)
y_pred = clf.predict(X_test)


print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))


[[15  0  0  0]
 [ 2 14  1  5]
 [ 0  0 11  1]
 [ 1  0  1 12]]
              precision    recall  f1-score   support

     atheism       0.83      1.00      0.91        15
     forsale       1.00      0.64      0.78        22
    religion       0.85      0.92      0.88        12
     windows       0.67      0.86      0.75        14

    accuracy                           0.83        63
   macro avg       0.84      0.85      0.83        63
weighted avg       0.86      0.83      0.82        63

0.8253968253968254


# Support Vector Machine

In [400]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn import model_selection, naive_bayes, svm

In [401]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=0.8, kernel='linear', degree=100, gamma='auto')
SVM.fit(X_train, y_train)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(X_test)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, y_test)*100)

SVM Accuracy Score ->  85.71428571428571


In [402]:
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [403]:
y_pred = classifier.predict(X_test)

In [405]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

[[ 9  2  4  0]
 [ 0 19  3  0]
 [ 1  5  6  0]
 [ 1  5  1  7]]
              precision    recall  f1-score   support

     atheism       0.82      0.60      0.69        15
     forsale       0.61      0.86      0.72        22
    religion       0.43      0.50      0.46        12
     windows       1.00      0.50      0.67        14

    accuracy                           0.65        63
   macro avg       0.71      0.62      0.63        63
weighted avg       0.71      0.65      0.65        63

0.6507936507936508


# add n-gram

In [255]:
t = train_data['text_stem']

KeyError: 'text_stem'

In [None]:
(pd.Series(nltk.ngrams(t, 2)).value_counts())[:5]

In [None]:
t = train_data['text_stem']
text = []
for i in range(len(t)):
    text.append(t[i])

print(text)



In [None]:
ngrams = {}
chars = 2

for i in range(len(text)-chars):
    seq = text[i:i+chars]
    print(seq)
    if seq not in ngrams.keys():
        ngrams[seq] = []
    ngrams[seq].append(text[i+chars])