In [2]:
import os
import pandas as pd
import numpy as np
from scipy.stats import randint
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from IPython.display import display
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn import metrics

In [3]:
df = pd.read_csv('IBC.csv', index_col=False)
df

Unnamed: 0,SENTENCE,LABEL
0,Forcing middle-class workers to bear a greater...,1
1,Because it would not be worthwhile to bring a ...,1
2,"Indeed , Lind argues that high profits and hig...",1
3,"In fairness , it should be noted that he devot...",1
4,Psychological tactics are social control techn...,1
...,...,...
4321,"As Doug Ogden , former director of the Energy ...",0
4322,No study is perfect ; each one is subject to c...,0
4323,"Of course , market forces , the balancing of ,...",0
4324,"In Words Like Loaded Pistols , he sets out to ...",0


In [5]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,
                        ngram_range=(1, 2), 
                        stop_words='english')

# We transform each complaint into a vector
features = tfidf.fit_transform(df.SENTENCE).toarray()

labels = df.LABEL

print("Each of the %d sentences is represented by %d features (TF-IDF score of unigrams and bigrams)" %(features.shape))

Each of the 4326 sentences is represented by 3774 features (TF-IDF score of unigrams and bigrams)


In [6]:
X = df['SENTENCE'] # Collection of documents
y = df['LABEL'] # Target or the labels we want to predict (i.e., the 13 different complaints of products)

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.25,
                                                    random_state = 0)

In [7]:
models = [
    RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

# 5 Cross-validation
CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))

entries = []
for model in models:
  model_name = model.__class__.__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [8]:
mean_accuracy = cv_df.groupby('model_name').accuracy.mean()
std_accuracy = cv_df.groupby('model_name').accuracy.std()

acc = pd.concat([mean_accuracy, std_accuracy], axis= 1, 
          ignore_index=True)
acc.columns = ['Mean Accuracy', 'Standard deviation']
acc

Unnamed: 0_level_0,Mean Accuracy,Standard deviation
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1
LinearSVC,0.522654,0.007133
LogisticRegression,0.563801,0.009065
MultinomialNB,0.548778,0.012337
RandomForestClassifier,0.474803,0.004081


In [10]:
X_train, X_test, y_train, y_test,indices_train,indices_test = train_test_split(features, 
                                                               labels, 
                                                               df.index, test_size=0.25, 
                                                               random_state=1)
model = LogisticRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [15]:
# Classification report
print('\t\t\t\tCLASSIFICATIION METRICS\n')
print(metrics.classification_report(y_test, y_pred, labels=[-1, 0 ,1]))

				CLASSIFICATIION METRICS

              precision    recall  f1-score   support

          -1       0.53      0.54      0.54       430
           0       0.55      0.04      0.08       146
           1       0.58      0.72      0.64       506

    accuracy                           0.56      1082
   macro avg       0.55      0.43      0.42      1082
weighted avg       0.55      0.56      0.52      1082

