In [162]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import os 
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize

In [128]:
dirs = ['./Generated CSV/'+files for files in os.listdir('./Generated CSV/')]

In [129]:
text = []
target = []

for files in dirs:
    df = pd.read_csv(files,delimiter=',')
    check = df['Conversation'].isnull()
    
    for index,lines in enumerate(df['Conversation']):
        if not check[index]:
            text.append(lines)
            target.append(df['Class'][index])

In [163]:
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])
# text_clf = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', MultinomialNB())])
# text_clf = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', LogisticRegression(random_state=0, solver='lbfgs',
#                          multi_class='multinomial')),
# ])

In [164]:
X_train, X_test, y_train, y_test = train_test_split(text, target, test_size=0.2,shuffle=False)

In [165]:
porter = PorterStemmer()
stemmed = []

for index,sentence in enumerate(X_train):
    tokens = word_tokenize(sentence)
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in tokens]
    X_train[index] = ' '.join(stemmed)
    
for index,sentence in enumerate(X_test):
    tokens = word_tokenize(sentence)
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in tokens]
    X_train[index] = ' '.join(stemmed)

In [166]:
text_clf.fit(X_train,y_train)  

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                ('clf',
                 SGDClassifier(alpha=0.001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                      

In [167]:
docs_test = X_test
predicted = text_clf.predict(docs_test)
np.mean(predicted == y_test)

0.3827751196172249

In [168]:
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2),(2,2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3,1e-1,1e-4),
}

In [169]:
gs_clf = GridSearchCV(text_clf, parameters, cv=5, iid=False, n_jobs=-1)

In [170]:
gs_clf = gs_clf.fit(X_train,y_train)

In [171]:
docs_test = X_test
predicted = gs_clf.predict(docs_test)
np.mean(predicted == y_test)

0.3831738437001595

In [172]:
gs_clf.best_score_ 

0.40279519299327415