In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt

from sklearn.metrics import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer


from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.feature_selection import SelectKBest, chi2

In [2]:
df = pd.read_csv("final_dataset.csv")

- Divide Training - Test (70-30)
- Encoding test:
                 0: negative
                 1: neutral
                 2: positive

In [3]:
df.head()

Unnamed: 0,original_text,preprocessed_text,length_text,not_tag_text,airline,airline_sentiment,negative_reason
0,@VirginAmerica What @dhepburn said.,@mention @mention say,22,say,Virgin America,neutral,
1,@VirginAmerica plus you've added commercials t...,@mention plus add commercial experience tacky,46,plus add commercial experience tacky,Virgin America,positive,
2,@VirginAmerica I didn't today... Must mean I n...,@mention today must mean need take another trip,48,today must mean need take another trip,Virgin America,neutral,
3,@VirginAmerica it's really aggressive to blast...,@mention really aggressive blast obnoxious ent...,88,really aggressive blast obnoxious entertainme...,Virgin America,negative,Bad Flight
4,@VirginAmerica and it's a really big bad thing...,@mention really big bad thing,30,really big bad thing,Virgin America,negative,Can't Tell


In [4]:
encoder = LabelEncoder()
X = df['preprocessed_text']
y = encoder.fit_transform(df["airline_sentiment"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1000, stratify=y)

In [5]:
len(X_train),len(y_train),len(X_test),len(y_test)

(10248, 10248, 4392, 4392)

In [6]:
def model_evaluation(real_v, pred_v):
    print(f"Accuracy sore: {accuracy_score(real_v, pred_v)}")
    print("Classification report:")
    print(classification_report(real_v, pred_v))
    cm = confusion_matrix(real_v, pred_v)
    print (f"Confusion matrix \n {cm}")

## 1 - Vectorization
- Fit to learn a vocabulary dictionary of all tokens in the raw documents
- Transform documents to document-term matrix
- Extract token counts out of raw text documents using the vocabulary fitted with fit or the one provided to the constructor

In [7]:
vect = CountVectorizer(min_df = 5, ngram_range = (1,2))

In [8]:
print(vect)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=5,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


In [9]:
vect.fit(X_train)
X_train_vect = vect.transform(X_train)
print(X_train_vect)

  (0, 59)	1
  (0, 446)	1
  (0, 1107)	1
  (0, 1878)	1
  (0, 2209)	1
  (0, 2576)	1
  (0, 2598)	1
  (0, 2647)	1
  (0, 2794)	1
  (0, 3255)	1
  (0, 3516)	1
  (0, 3704)	1
  (0, 3761)	1
  (0, 3769)	1
  (0, 3827)	1
  (1, 1366)	1
  (1, 1380)	1
  (1, 1878)	2
  (1, 3611)	1
  (1, 3614)	1
  (2, 1878)	1
  (2, 2235)	1
  (2, 3713)	1
  (3, 667)	1
  (3, 674)	1
  :	:
  (10246, 383)	1
  (10246, 627)	1
  (10246, 971)	1
  (10246, 972)	1
  (10246, 1085)	1
  (10246, 1366)	1
  (10246, 1375)	1
  (10246, 1398)	1
  (10246, 1878)	1
  (10246, 2365)	1
  (10246, 2375)	1
  (10246, 2434)	2
  (10246, 2449)	1
  (10246, 2492)	1
  (10246, 2598)	1
  (10246, 2599)	1
  (10247, 125)	1
  (10247, 562)	1
  (10247, 1878)	1
  (10247, 1949)	1
  (10247, 2721)	1
  (10247, 2733)	1
  (10247, 3157)	1
  (10247, 3611)	1
  (10247, 3759)	1


In [10]:
X_test_vect =vect.transform(X_test)
print(X_test_vect)

  (0, 191)	1
  (0, 446)	1
  (0, 449)	1
  (0, 971)	1
  (0, 978)	1
  (0, 1068)	1
  (0, 1366)	1
  (0, 1398)	1
  (0, 1405)	1
  (0, 1878)	1
  (0, 2094)	1
  (0, 2365)	2
  (0, 2370)	1
  (0, 2375)	1
  (0, 2655)	1
  (0, 3200)	1
  (0, 3555)	1
  (1, 344)	1
  (1, 345)	1
  (1, 481)	2
  (1, 814)	1
  (1, 1172)	1
  (1, 1336)	1
  (1, 1874)	1
  (1, 1878)	1
  :	:
  (4389, 1993)	1
  (4389, 3327)	1
  (4389, 3588)	1
  (4390, 803)	2
  (4390, 804)	1
  (4390, 1349)	1
  (4390, 1635)	1
  (4390, 1878)	1
  (4390, 2018)	1
  (4390, 3377)	1
  (4391, 55)	1
  (4391, 794)	1
  (4391, 978)	1
  (4391, 1035)	1
  (4391, 1262)	1
  (4391, 1878)	1
  (4391, 2434)	2
  (4391, 2567)	1
  (4391, 2844)	1
  (4391, 2895)	1
  (4391, 2914)	1
  (4391, 3187)	1
  (4391, 3605)	1
  (4391, 3719)	1
  (4391, 3720)	1


- Array mapping from feature integer indices to feature name

In [11]:
vect.get_feature_names()[13]

'acceptable'

In [12]:
def builtGridSearch(X_train, X_test, y_train, y_test, scores, model, tuned_parameters):
    optimals = {}
    for score in scores:
        print("------- Score = " + str(score) + " ------- \n")
        
        k_fold = StratifiedKFold(n_splits=3, random_state=42)
        print("> Fold = " + str(k_fold) + "\n")
        
        clf = GridSearchCV(model, tuned_parameters, error_score='raise', cv=k_fold, scoring = score, return_train_score=True)

        clf.fit(X_train, y_train)

        print("> Best Parameter set: \n")
        best = clf.best_params_
        print(best)
        
        print("\n> Grid scores:\n")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        
        print("...........RESULTS FOR TRAINING.........")
        print("........................................")
        means = clf.cv_results_['mean_train_score']
        stds = clf.cv_results_['std_train_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))        
        
        print("____________________________________________")
        
        optimals[score] = best
    return optimals

## 2 - NaiveBayes

In [13]:
scores = ["f1_macro"]

params = {"selbestk__k": [200, 300, 500, 800, 1000]}

clf = Pipeline([
                ("selbestk", SelectKBest(score_func = chi2)),
                ("tfidf", TfidfTransformer()),
                ("mlnb", MultinomialNB())
                ])

optimals = builtGridSearch(X_train_vect, X_test_vect, y_train, y_test, scores, clf, params)

------- Score = f1_macro ------- 

> Fold = StratifiedKFold(n_splits=3, random_state=42, shuffle=False)





> Best Parameter set: 

{'selbestk__k': 1000}

> Grid scores:

...........RESULTS FOR TRAINING.........
........................................
0.624 (+/-0.003) for {'selbestk__k': 200}
0.654 (+/-0.006) for {'selbestk__k': 300}
0.687 (+/-0.003) for {'selbestk__k': 500}
0.716 (+/-0.004) for {'selbestk__k': 800}
0.725 (+/-0.004) for {'selbestk__k': 1000}
____________________________________________


In [14]:
sel = SelectKBest(chi2, k=optimals["f1_macro"]["selbestk__k"])  
sel.fit(X_train_vect,y_train)
X_train_sel = sel.transform(X_train_vect)
X_test_sel = sel.transform(X_test_vect)

tfidf = TfidfTransformer()
tfidf.fit(X_train_sel)
X_train_vec_bestK = tfidf.transform(X_train_sel)
X_test_vec_bestK =tfidf.transform(X_test_sel)

learner = MultinomialNB()
classifier = learner.fit(X_train_vec_bestK, y_train)
predictions = classifier.predict(X_test_vec_bestK)

In [15]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.7552367941712204
Classification report:
              precision    recall  f1-score   support

           0       0.76      0.95      0.84      2753
           1       0.67      0.33      0.44       930
           2       0.81      0.55      0.66       709

    accuracy                           0.76      4392
   macro avg       0.75      0.61      0.65      4392
weighted avg       0.75      0.76      0.73      4392

Confusion matrix 
 [[2617   89   47]
 [ 576  307   47]
 [ 256   60  393]]


## 2 - KNN

In [16]:
scores = ["f1_macro"]

params = {"selbestk__k": [200, 300, 500, 800, 1000],
          'knn__n_neighbors': list(range(2, 30)),
          'knn__weights': ['uniform', 'distance'],
          'knn__metric': ["euclidean", "manhattan", "minkowski"]}

clf = Pipeline([
                ("selbestk", SelectKBest(score_func = chi2)),
                ("tfidf", TfidfTransformer()),
                ("knn", KNeighborsClassifier())
                ])

optimals = builtGridSearch(X_train_vect, X_test_vect, y_train, y_test, scores, clf, params)

------- Score = f1_macro ------- 

> Fold = StratifiedKFold(n_splits=3, random_state=42, shuffle=False)





> Best Parameter set: 

{'knn__metric': 'manhattan', 'knn__n_neighbors': 16, 'knn__weights': 'distance', 'selbestk__k': 200}

> Grid scores:

...........RESULTS FOR TRAINING.........
........................................
0.673 (+/-0.026) for {'knn__metric': 'euclidean', 'knn__n_neighbors': 2, 'knn__weights': 'uniform', 'selbestk__k': 200}
0.708 (+/-0.043) for {'knn__metric': 'euclidean', 'knn__n_neighbors': 2, 'knn__weights': 'uniform', 'selbestk__k': 300}
0.736 (+/-0.025) for {'knn__metric': 'euclidean', 'knn__n_neighbors': 2, 'knn__weights': 'uniform', 'selbestk__k': 500}
0.748 (+/-0.008) for {'knn__metric': 'euclidean', 'knn__n_neighbors': 2, 'knn__weights': 'uniform', 'selbestk__k': 800}
0.746 (+/-0.003) for {'knn__metric': 'euclidean', 'knn__n_neighbors': 2, 'knn__weights': 'uniform', 'selbestk__k': 1000}
0.801 (+/-0.029) for {'knn__metric': 'euclidean', 'knn__n_neighbors': 2, 'knn__weights': 'distance', 'selbestk__k': 200}
0.862 (+/-0.025) for {'knn__metric': 'euclidean', 'knn

In [17]:
sel = SelectKBest(chi2, k=optimals["f1_macro"]["selbestk__k"])  
sel.fit(X_train_vect,y_train)
X_train_sel = sel.transform(X_train_vect)
X_test_sel = sel.transform(X_test_vect)

tfidf = TfidfTransformer()
tfidf.fit(X_train_sel)
X_train_vec_bestK = tfidf.transform(X_train_sel)
X_test_vec_bestK =tfidf.transform(X_test_sel)

learner = KNeighborsClassifier(n_neighbors = optimals["f1_macro"]["knn__n_neighbors"], weights = optimals["f1_macro"]["knn__weights"] , metric = optimals["f1_macro"]["knn__metric"])
classifier = learner.fit(X_train_vec_bestK, y_train)
predictions = classifier.predict(X_test_vec_bestK)

In [18]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.7144808743169399
Classification report:
              precision    recall  f1-score   support

           0       0.79      0.84      0.82      2753
           1       0.47      0.47      0.47       930
           2       0.70      0.52      0.60       709

    accuracy                           0.71      4392
   macro avg       0.66      0.61      0.63      4392
weighted avg       0.71      0.71      0.71      4392

Confusion matrix 
 [[2325  347   81]
 [ 411  441   78]
 [ 190  147  372]]


## 3 - Decision Tree

In [19]:
scores = ["f1_macro"]

params = {"selbestk__k": [200, 300, 500, 800, 1000],
          'dt__criterion' : ['gini', 'entropy'],
          'dt__max_depth': [None, 2, 5, 10, 15, 20],
          'dt__min_samples_split': [2, 5, 10, 15, 20],
          'dt__min_samples_leaf': [1, 5, 10, 15, 20]}

clf = Pipeline([
                ("selbestk", SelectKBest(score_func = chi2)),
                ("tfidf", TfidfTransformer()),
                ("dt", DecisionTreeClassifier())
                ])

optimals = builtGridSearch(X_train_vect, X_test_vect, y_train, y_test, scores, clf, params)

------- Score = f1_macro ------- 

> Fold = StratifiedKFold(n_splits=3, random_state=42, shuffle=False)





> Best Parameter set: 

{'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_leaf': 5, 'dt__min_samples_split': 20, 'selbestk__k': 200}

> Grid scores:

...........RESULTS FOR TRAINING.........
........................................
0.844 (+/-0.019) for {'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2, 'selbestk__k': 200}
0.895 (+/-0.010) for {'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2, 'selbestk__k': 300}
0.940 (+/-0.009) for {'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2, 'selbestk__k': 500}
0.967 (+/-0.007) for {'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2, 'selbestk__k': 800}
0.975 (+/-0.004) for {'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2, 'selbestk__k': 1000}
0.824 (+/-0.017) for {'dt_

In [20]:
sel = SelectKBest(chi2, k=optimals["f1_macro"]["selbestk__k"])  
sel.fit(X_train_vect,y_train)
X_train_sel = sel.transform(X_train_vect)
X_test_sel = sel.transform(X_test_vect)

tfidf = TfidfTransformer()
tfidf.fit(X_train_sel)
X_train_vec_bestK = tfidf.transform(X_train_sel)
X_test_vec_bestK =tfidf.transform(X_test_sel)

learner = DecisionTreeClassifier(criterion = optimals["f1_macro"]["dt__criterion"], 
                                 max_depth = optimals["f1_macro"]["dt__max_depth"], 
                                 min_samples_split = optimals["f1_macro"]["dt__min_samples_split"],
                                 min_samples_leaf = optimals["f1_macro"]["dt__min_samples_leaf"]
                                )
classifier = learner.fit(X_train_vec_bestK, y_train)
predictions = classifier.predict(X_test_vec_bestK)

In [21]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.7040072859744991
Classification report:
              precision    recall  f1-score   support

           0       0.79      0.84      0.81      2753
           1       0.48      0.44      0.46       930
           2       0.63      0.53      0.57       709

    accuracy                           0.70      4392
   macro avg       0.63      0.60      0.62      4392
weighted avg       0.70      0.70      0.70      4392

Confusion matrix 
 [[2305  326  122]
 [ 421  413   96]
 [ 210  125  374]]
