In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import os 
import numpy as np
import glob
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

In [22]:
dirs = [files for files in glob.glob('./Generated CSV/*.csv')]

In [24]:
text = []
target = []

for files in dirs:
    df = pd.read_csv(files,delimiter=',')
    check = df['Conversation'].isnull()
    for index,lines in enumerate(df['Conversation']):
        if not check[index]:
            text.append(lines)
            target.append(df['Class'][index])

In [33]:
text_clf = Pipeline([
    ('vect', CountVectorizer(analyzer='char',ngram_range=(2, 6))),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])
# text_clf = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', MultinomialNB())])
# text_clf = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', LogisticRegression(random_state=0, solver='lbfgs',
#                          multi_class='multinomial')),
# ])

In [34]:
X_train, X_test, y_train, y_test = train_test_split(text, target, test_size=0.2,shuffle=False)

In [35]:
porter = PorterStemmer()
stemmed = []

for index,sentence in enumerate(X_train):
    tokens = word_tokenize(sentence)
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in tokens]
    X_train[index] = ' '.join(stemmed)
    
for index,sentence in enumerate(X_test):
    tokens = word_tokenize(sentence)
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in tokens]
    X_test[index] = ' '.join(stemmed)

In [36]:
text_clf.fit(X_train,y_train)  

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='char', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(2, 6), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('clf',
                 SGDClassifier(alpha=0.001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                      

In [37]:
docs_test = X_test
predicted = text_clf.predict(docs_test)
np.mean(predicted == y_test)

0.38197767145135564

In [38]:
parameters = {
    'vect__ngram_range': [(2, 4), (2, 6),(2,8)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3,1e-1,1e-4),
}

In [39]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1)

In [40]:
gs_clf = gs_clf.fit(X_train,y_train)

In [41]:
docs_test = X_test
predicted = gs_clf.predict(docs_test)
np.mean(predicted == y_test)

0.38197767145135564

In [42]:
gs_clf.best_score_ 

0.4043895636907286

In [43]:
gs_clf.cv_results_

{'mean_fit_time': array([0.90783334, 5.39342356, 7.49152374, 1.23249869, 4.50110936,
        7.59686403, 1.42193165, 3.27133622, 9.26128349, 2.23914299,
        2.84855847, 8.81538348, 2.58458743, 2.47575526, 9.24737864,
        3.32564993, 2.49377966, 7.94389138, 3.01807437, 2.78017015,
        9.22432485, 3.19941568, 2.46289482, 3.67421074]),
 'std_fit_time': array([0.15089599, 1.1243999 , 1.50523641, 0.19513954, 0.94192728,
        1.14868395, 0.25619692, 0.6581179 , 0.52376997, 0.84474123,
        0.3402822 , 0.3130512 , 1.05558676, 0.05454763, 0.42408988,
        0.66985565, 0.11847266, 0.83521623, 1.03135604, 0.68846208,
        0.42372308, 0.85272889, 0.06714076, 0.27654367]),
 'mean_score_time': array([0.22592731, 1.20459747, 1.10476041, 0.24964366, 1.41683316,
        0.62184691, 0.27492619, 1.31884351, 0.77840309, 0.27220449,
        0.98690753, 0.63121343, 0.36302853, 0.70005174, 0.7580822 ,
        0.58578777, 0.41819682, 1.30005102, 0.77125754, 0.82384906,
        0.764797

In [44]:
gs_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (2, 6)}