In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt

from sklearn.metrics import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer


from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import SelectKBest, chi2

In [3]:
df = pd.read_csv("DataSet/final_dataset.csv")

- Divide Training - Test (70-30)
- Encoding test:
                 0: negative
                 1: neutral
                 2: positive

In [4]:
df = df[df["airline_sentiment"]!="neutral"].copy()

In [5]:
df.head()

Unnamed: 0,original_text,preprocessed_text,length_text,not_tag_text,airline,airline_sentiment,negative_reason
1,@VirginAmerica plus you've added commercials t...,@mention plus add commercial experience tacky,46,plus add commercial experience tacky,Virgin America,positive,
3,@VirginAmerica it's really aggressive to blast...,@mention really aggressive blast obnoxious ent...,88,really aggressive blast obnoxious entertainme...,Virgin America,negative,Bad Flight
4,@VirginAmerica and it's a really big bad thing...,@mention really big bad thing,30,really big bad thing,Virgin America,negative,Can't Tell
5,@VirginAmerica seriously would pay $30 a fligh...,@mention seriously would pay @number flight se...,78,seriously would pay flight seat play really ...,Virgin America,negative,Can't Tell
6,"@VirginAmerica yes, nearly every time I fly VX...",@mention yes nearly every time fly vx @emoji w...,65,yes nearly every time fly vx worm go away,Virgin America,positive,


In [6]:
encoder = LabelEncoder()
X = df['preprocessed_text']
y = encoder.fit_transform(df["airline_sentiment"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1000, stratify=y)

In [7]:
len(X_train),len(y_train),len(X_test),len(y_test)

(8078, 8078, 3463, 3463)

In [8]:
def model_evaluation(real_v, pred_v):
    print(f"Accuracy sore: {accuracy_score(real_v, pred_v)}")
    print("Classification report:")
    print(classification_report(real_v, pred_v))
    cm = confusion_matrix(real_v, pred_v)
    print (f"Confusion matrix \n {cm}")

## 1 - Vectorization
- Fit to learn a vocabulary dictionary of all tokens in the raw documents
- Transform documents to document-term matrix
- Extract token counts out of raw text documents using the vocabulary fitted with fit or the one provided to the constructor

In [9]:
vect = CountVectorizer(min_df = 5, ngram_range = (1,2))

In [10]:
print(vect)

CountVectorizer(min_df=5, ngram_range=(1, 2))


In [11]:
vect.fit(X_train)
X_train_vect = vect.transform(X_train)
print(X_train_vect)

  (0, 193)	1
  (0, 196)	1
  (0, 1122)	1
  (0, 1322)	1
  (0, 1374)	1
  (0, 1484)	1
  (0, 1488)	1
  (0, 1489)	1
  (0, 1550)	1
  (0, 1567)	1
  (0, 1837)	1
  (0, 2034)	1
  (0, 2095)	1
  (0, 2861)	1
  (0, 2961)	1
  (0, 3093)	1
  (1, 196)	1
  (1, 205)	1
  (1, 1158)	1
  (1, 1488)	1
  (1, 1489)	1
  (1, 1567)	1
  (1, 1738)	1
  (1, 2034)	2
  (1, 2274)	1
  :	:
  (8076, 324)	1
  (8076, 818)	1
  (8076, 856)	1
  (8076, 1067)	1
  (8076, 1070)	1
  (8076, 1133)	1
  (8076, 1199)	1
  (8076, 1202)	1
  (8076, 1401)	1
  (8076, 1432)	1
  (8076, 1567)	2
  (8076, 1748)	1
  (8076, 1866)	1
  (8076, 2930)	1
  (8076, 3080)	1
  (8076, 3083)	1
  (8076, 3185)	1
  (8077, 333)	1
  (8077, 805)	1
  (8077, 818)	1
  (8077, 1432)	1
  (8077, 1567)	1
  (8077, 1613)	1
  (8077, 2385)	1
  (8077, 3107)	1


In [12]:
X_test_vect =vect.transform(X_test)
print(X_test_vect)

  (0, 1362)	1
  (0, 1484)	1
  (0, 1567)	1
  (0, 1752)	1
  (0, 1962)	1
  (0, 1966)	1
  (0, 1980)	1
  (0, 2029)	1
  (0, 2185)	1
  (0, 2287)	1
  (1, 53)	1
  (1, 395)	1
  (1, 1067)	1
  (1, 1567)	2
  (1, 1748)	1
  (1, 2034)	2
  (1, 2511)	1
  (1, 2512)	1
  (1, 3060)	1
  (1, 3116)	1
  (1, 3140)	1
  (2, 186)	1
  (2, 422)	1
  (2, 433)	1
  (2, 434)	1
  :	:
  (3460, 1852)	1
  (3460, 2167)	1
  (3460, 2962)	1
  (3461, 44)	1
  (3461, 512)	1
  (3461, 517)	1
  (3461, 547)	1
  (3461, 707)	1
  (3461, 1332)	1
  (3461, 1567)	1
  (3461, 1633)	1
  (3461, 1992)	1
  (3461, 2198)	1
  (3461, 2517)	1
  (3461, 2589)	1
  (3461, 2590)	1
  (3461, 3080)	1
  (3462, 395)	1
  (3462, 1277)	1
  (3462, 1567)	1
  (3462, 1765)	1
  (3462, 2161)	1
  (3462, 2162)	1
  (3462, 2831)	1
  (3462, 2843)	1


- Array mapping from feature integer indices to feature name

In [13]:
vect.get_feature_names()[13]

'access'

In [14]:
def builtGridSearch(X_train, X_test, y_train, y_test, scores, model, tuned_parameters):
    optimals = {}
    for score in scores:
        print("------- Score = " + str(score) + " ------- \n")
        
        k_fold = StratifiedKFold(n_splits=3, random_state=42)
        print("> Fold = " + str(k_fold) + "\n")
        
        clf = GridSearchCV(model, tuned_parameters, error_score='raise', cv=k_fold, scoring = score, return_train_score=True)

        clf.fit(X_train, y_train)

        print("> Best Parameter set: \n")
        best = clf.best_params_
        print(best)
        
        print("\n> Grid scores:\n")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        
        print("...........RESULTS FOR TRAINING.........")
        print("........................................")
        means = clf.cv_results_['mean_train_score']
        stds = clf.cv_results_['std_train_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))        
        
        print("____________________________________________")
        
        optimals[score] = best
    return optimals

## 2 - NaiveBayes

In [16]:
scores = ["f1_macro"]

params = {"selbestk__k": [200, 300, 500, 800, 1000]}

clf = Pipeline([
                ("selbestk", SelectKBest(score_func = chi2)),
                ("tfidf", TfidfTransformer()),
                ("mlnb", MultinomialNB())
                ])

optimals = builtGridSearch(X_train_vect, X_test_vect, y_train, y_test, scores, clf, params)

------- Score = f1_macro ------- 

> Fold = StratifiedKFold(n_splits=3, random_state=42, shuffle=False)





> Best Parameter set: 

{'selbestk__k': 500}

> Grid scores:

...........RESULTS FOR TRAINING.........
........................................
0.847 (+/-0.013) for {'selbestk__k': 200}
0.858 (+/-0.008) for {'selbestk__k': 300}
0.870 (+/-0.006) for {'selbestk__k': 500}
0.876 (+/-0.003) for {'selbestk__k': 800}
0.880 (+/-0.005) for {'selbestk__k': 1000}
____________________________________________


In [17]:
sel = SelectKBest(chi2, k=optimals["f1_macro"]["selbestk__k"])  
sel.fit(X_train_vect,y_train)
X_train_sel = sel.transform(X_train_vect)
X_test_sel = sel.transform(X_test_vect)

tfidf = TfidfTransformer()
tfidf.fit(X_train_sel)
X_train_vec_bestK = tfidf.transform(X_train_sel)
X_test_vec_bestK =tfidf.transform(X_test_sel)

learner = MultinomialNB()
classifier = learner.fit(X_train_vec_bestK, y_train)
predictions = classifier.predict(X_test_vec_bestK)

In [18]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.8992203291943401
Classification report:
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      2754
           1       0.84      0.63      0.72       709

    accuracy                           0.90      3463
   macro avg       0.87      0.80      0.83      3463
weighted avg       0.90      0.90      0.89      3463

Confusion matrix 
 [[2666   88]
 [ 261  448]]


## 2 - KNN

In [17]:
scores = ["f1_macro"]

params = {"selbestk__k": [200, 300, 500, 800, 1000],
          'knn__n_neighbors': list(range(2, 30)),
          'knn__weights': ['uniform', 'distance'],
          'knn__metric': ["euclidean", "manhattan", "minkowski"]}

clf = Pipeline([
                ("selbestk", SelectKBest(score_func = chi2)),
                ("tfidf", TfidfTransformer()),
                ("knn", KNeighborsClassifier())
                ])

optimals = builtGridSearch(X_train_vect, X_test_vect, y_train, y_test, scores, clf, params)

------- Score = f1_macro ------- 

> Fold = StratifiedKFold(n_splits=3, random_state=42, shuffle=False)





> Best Parameter set: 

{'knn__metric': 'euclidean', 'knn__n_neighbors': 5, 'knn__weights': 'uniform', 'selbestk__k': 200}

> Grid scores:

...........RESULTS FOR TRAINING.........
........................................
0.830 (+/-0.066) for {'knn__metric': 'euclidean', 'knn__n_neighbors': 2, 'knn__weights': 'uniform', 'selbestk__k': 200}
0.850 (+/-0.037) for {'knn__metric': 'euclidean', 'knn__n_neighbors': 2, 'knn__weights': 'uniform', 'selbestk__k': 300}
0.870 (+/-0.016) for {'knn__metric': 'euclidean', 'knn__n_neighbors': 2, 'knn__weights': 'uniform', 'selbestk__k': 500}
0.906 (+/-0.010) for {'knn__metric': 'euclidean', 'knn__n_neighbors': 2, 'knn__weights': 'uniform', 'selbestk__k': 800}
0.891 (+/-0.070) for {'knn__metric': 'euclidean', 'knn__n_neighbors': 2, 'knn__weights': 'uniform', 'selbestk__k': 1000}
0.899 (+/-0.058) for {'knn__metric': 'euclidean', 'knn__n_neighbors': 2, 'knn__weights': 'distance', 'selbestk__k': 200}
0.937 (+/-0.032) for {'knn__metric': 'euclidean', 'knn__

In [18]:
sel = SelectKBest(chi2, k=optimals["f1_macro"]["selbestk__k"])  
sel.fit(X_train_vect,y_train)
X_train_sel = sel.transform(X_train_vect)
X_test_sel = sel.transform(X_test_vect)

tfidf = TfidfTransformer()
tfidf.fit(X_train_sel)
X_train_vec_bestK = tfidf.transform(X_train_sel)
X_test_vec_bestK =tfidf.transform(X_test_sel)

learner = KNeighborsClassifier(n_neighbors = optimals["f1_macro"]["knn__n_neighbors"], weights = optimals["f1_macro"]["knn__weights"] , metric = optimals["f1_macro"]["knn__metric"])
classifier = learner.fit(X_train_vec_bestK, y_train)
predictions = classifier.predict(X_test_vec_bestK)

In [19]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.8807392434305515
Classification report:
              precision    recall  f1-score   support

           0       0.91      0.94      0.93      2754
           1       0.74      0.64      0.69       709

    accuracy                           0.88      3463
   macro avg       0.83      0.79      0.81      3463
weighted avg       0.88      0.88      0.88      3463

Confusion matrix 
 [[2597  157]
 [ 256  453]]


## 3 - Decision Tree

In [20]:
scores = ["f1_macro"]

params = {"selbestk__k": [200, 300, 500, 800, 1000],
          'dt__criterion' : ['gini', 'entropy'],
          'dt__max_depth': [None, 2, 5, 10, 15, 20],
          'dt__min_samples_split': [2, 5, 10, 15, 20],
          'dt__min_samples_leaf': [1, 5, 10, 15, 20]}

clf = Pipeline([
                ("selbestk", SelectKBest(score_func = chi2)),
                ("tfidf", TfidfTransformer()),
                ("dt", DecisionTreeClassifier())
                ])

optimals = builtGridSearch(X_train_vect, X_test_vect, y_train, y_test, scores, clf, params)

------- Score = f1_macro ------- 

> Fold = StratifiedKFold(n_splits=3, random_state=42, shuffle=False)





> Best Parameter set: 

{'dt__criterion': 'gini', 'dt__max_depth': 15, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 10, 'selbestk__k': 1000}

> Grid scores:

...........RESULTS FOR TRAINING.........
........................................
0.940 (+/-0.010) for {'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2, 'selbestk__k': 200}
0.962 (+/-0.004) for {'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2, 'selbestk__k': 300}
0.984 (+/-0.001) for {'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2, 'selbestk__k': 500}
0.991 (+/-0.001) for {'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2, 'selbestk__k': 800}
0.993 (+/-0.001) for {'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2, 'selbestk__k': 1000}
0.929 (+/-0.009) for {'dt__

In [21]:
sel = SelectKBest(chi2, k=optimals["f1_macro"]["selbestk__k"])  
sel.fit(X_train_vect,y_train)
X_train_sel = sel.transform(X_train_vect)
X_test_sel = sel.transform(X_test_vect)

tfidf = TfidfTransformer()
tfidf.fit(X_train_sel)
X_train_vec_bestK = tfidf.transform(X_train_sel)
X_test_vec_bestK =tfidf.transform(X_test_sel)

learner = DecisionTreeClassifier(criterion = optimals["f1_macro"]["dt__criterion"], 
                                 max_depth = optimals["f1_macro"]["dt__max_depth"], 
                                 min_samples_split = optimals["f1_macro"]["dt__min_samples_split"],
                                 min_samples_leaf = optimals["f1_macro"]["dt__min_samples_leaf"]
                                )
classifier = learner.fit(X_train_vec_bestK, y_train)
predictions = classifier.predict(X_test_vec_bestK)

In [22]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.875541438059486
Classification report:
              precision    recall  f1-score   support

           0       0.91      0.94      0.92      2754
           1       0.72      0.63      0.68       709

    accuracy                           0.88      3463
   macro avg       0.82      0.79      0.80      3463
weighted avg       0.87      0.88      0.87      3463

Confusion matrix 
 [[2583  171]
 [ 260  449]]
