In [8]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterGrid
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer   # “Term Frequency times Inverse Document Frequency”
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB       # Multinomial Naive Bayes, supposedly goes well with the data from the transformer
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix

In [9]:
df = pd.read_csv('C:\\Users\\maxjp\\PycharmProjects\\complingproject\\MasterDataFile.csv')      # to read the training data into working memory
df['feature'] = df['feature'].replace(to_replace='<.*>', value='', regex=True)

X_train, X_test, y_train, y_test = train_test_split(df['feature'],   # Test-Train Split function
                                                    df['target'],
                                                    train_size = 0.8,
                                                    test_size = 0.2,
                                                    random_state=14

vect = CountVectorizer(min_df=1,          # Minimum Document Frequency
                       ngram_range=(1, 2),   # unigrams to bigrams
                       stop_words='english',   # stopwords
                       lowercase =False,
                       binary = True
                       ).fit(X_train)

X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

X_train_vectorized_tfidf = TfidfTransformer().fit_transform(X_train_vectorized) # or X_train_dense
X_test_vectorized_tfidf = TfidfTransformer().fit_transform(X_test_vectorized) # or X_test_dense

In [10]:
LSVCParam_Grid = ({'penalty': ['l2'], 'loss': ['squared_hinge'], 'dual': [True, False], 'C': [1, 100, 1000, 10000], 
                  'multi_class': ['ovr', 'crammer_singer'], 'max_iter': [1000, 2000, 8000]},
                  {'penalty': ['l2'], 'loss': ['hinge', 'squared_hinge'], 'dual': [True], 'C': [1, 100, 1000, 10000], 
                  'multi_class': ['ovr', 'crammer_singer'], 'max_iter': [1000, 2000, 8000]})

In [11]:
lsvc = LinearSVC()
clf = GridSearchCV(lsvc, LSVCParam_Grid)

In [12]:
clf.fit(X_train_vectorized, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=({'penalty': ['l2'], 'loss': ['squared_hinge'], 'dual': [True, False], 'C': [1, 100, 1000, 10000], 'multi_class': ['ovr', 'crammer_singer'], 'max_iter': [1000, 2000, 8000]}, {'penalty': ['l2'], 'loss': ['hinge', 'squared_hinge'], 'dual': [True], 'C': [1, 100, 1000, 10000], 'multi_class': ['ovr', 'crammer_singer'], 'max_iter': [1000, 2000, 8000]}),
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [13]:
clf.best_score_

0.97125193199381765

In [14]:
clf.best_estimator_

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [15]:
clf.best_params_

{'C': 1,
 'dual': True,
 'loss': 'squared_hinge',
 'max_iter': 1000,
 'multi_class': 'ovr',
 'penalty': 'l2'}

In [16]:
clf.score(X_test_vectorized, y_test)

0.97404202719406674

In [17]:
ourlsvc = LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0).fit(X_train_vectorized, y_train)

In [18]:
y_predict = ourlsvc.predict(X_test_vectorized)
y_test_compare = y_test.as_matrix()

In [None]:
print('LinearSVC with parameters from GridSearchCV')
print('Accuracy: ', accuracy_score(y_test_compare, y_predict))
print('f1: ', f1_score(y_test_compare, y_predict))
print('Precision: ', precision_score(y_test_compare, y_predict))
print('Recall: ', recall_score(y_test_compare, y_predict))
print(classification_report(y_test_compare, y_predict))