In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt

from sklearn.metrics import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer

from sklearn.svm import LinearSVC, SVC

from sklearn.feature_selection import SelectKBest, chi2

In [2]:
df = pd.read_csv("final_dataset.csv")

In [3]:
df = df[df["airline_sentiment"]!="neutral"].copy()

- Divide Training - Test (70-30)
- Encoding test:
                 0: negative
                 1: neutral
                 2: positive

In [4]:
df.head()

Unnamed: 0,original_text,preprocessed_text,length_text,not_tag_text,airline,airline_sentiment,negative_reason
1,@VirginAmerica plus you've added commercials t...,@mention plus add commercial experience tacky,46,plus add commercial experience tacky,Virgin America,positive,
3,@VirginAmerica it's really aggressive to blast...,@mention really aggressive blast obnoxious ent...,88,really aggressive blast obnoxious entertainme...,Virgin America,negative,Bad Flight
4,@VirginAmerica and it's a really big bad thing...,@mention really big bad thing,30,really big bad thing,Virgin America,negative,Can't Tell
5,@VirginAmerica seriously would pay $30 a fligh...,@mention seriously would pay @number flight se...,78,seriously would pay flight seat play really ...,Virgin America,negative,Can't Tell
6,"@VirginAmerica yes, nearly every time I fly VX...",@mention yes nearly every time fly vx @emoji w...,65,yes nearly every time fly vx worm go away,Virgin America,positive,


In [5]:
encoder = LabelEncoder()
X = df['preprocessed_text']
y = encoder.fit_transform(df["airline_sentiment"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1000, stratify=y)

In [6]:
len(X_train),len(y_train),len(X_test),len(y_test)

(8078, 8078, 3463, 3463)

In [7]:
def model_evaluation(real_v, pred_v):
    print(f"Accuracy sore: {accuracy_score(real_v, pred_v)}")
    print("Classification report:")
    print(classification_report(real_v, pred_v))
    cm = confusion_matrix(real_v, pred_v)
    print (f"Confusion matrix \n {cm}")

## 1 - Vectorization
- Fit to learn a vocabulary dictionary of all tokens in the raw documents
- Transform documents to document-term matrix
- Extract token counts out of raw text documents using the vocabulary fitted with fit or the one provided to the constructor

In [8]:
vect = CountVectorizer(min_df = 5, ngram_range = (1,2))

In [9]:
print(vect)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=5,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


In [10]:
vect.fit(X_train)
X_train_vect = vect.transform(X_train)
print(X_train_vect)

  (0, 193)	1
  (0, 196)	1
  (0, 1122)	1
  (0, 1322)	1
  (0, 1374)	1
  (0, 1484)	1
  (0, 1488)	1
  (0, 1489)	1
  (0, 1550)	1
  (0, 1567)	1
  (0, 1837)	1
  (0, 2034)	1
  (0, 2095)	1
  (0, 2861)	1
  (0, 2961)	1
  (0, 3093)	1
  (1, 196)	1
  (1, 205)	1
  (1, 1158)	1
  (1, 1488)	1
  (1, 1489)	1
  (1, 1567)	1
  (1, 1738)	1
  (1, 2034)	2
  (1, 2274)	1
  :	:
  (8076, 324)	1
  (8076, 818)	1
  (8076, 856)	1
  (8076, 1067)	1
  (8076, 1070)	1
  (8076, 1133)	1
  (8076, 1199)	1
  (8076, 1202)	1
  (8076, 1401)	1
  (8076, 1432)	1
  (8076, 1567)	2
  (8076, 1748)	1
  (8076, 1866)	1
  (8076, 2930)	1
  (8076, 3080)	1
  (8076, 3083)	1
  (8076, 3185)	1
  (8077, 333)	1
  (8077, 805)	1
  (8077, 818)	1
  (8077, 1432)	1
  (8077, 1567)	1
  (8077, 1613)	1
  (8077, 2385)	1
  (8077, 3107)	1


In [11]:
X_test_vect =vect.transform(X_test)
print(X_test_vect)

  (0, 1362)	1
  (0, 1484)	1
  (0, 1567)	1
  (0, 1752)	1
  (0, 1962)	1
  (0, 1966)	1
  (0, 1980)	1
  (0, 2029)	1
  (0, 2185)	1
  (0, 2287)	1
  (1, 53)	1
  (1, 395)	1
  (1, 1067)	1
  (1, 1567)	2
  (1, 1748)	1
  (1, 2034)	2
  (1, 2511)	1
  (1, 2512)	1
  (1, 3060)	1
  (1, 3116)	1
  (1, 3140)	1
  (2, 186)	1
  (2, 422)	1
  (2, 433)	1
  (2, 434)	1
  :	:
  (3460, 1852)	1
  (3460, 2167)	1
  (3460, 2962)	1
  (3461, 44)	1
  (3461, 512)	1
  (3461, 517)	1
  (3461, 547)	1
  (3461, 707)	1
  (3461, 1332)	1
  (3461, 1567)	1
  (3461, 1633)	1
  (3461, 1992)	1
  (3461, 2198)	1
  (3461, 2517)	1
  (3461, 2589)	1
  (3461, 2590)	1
  (3461, 3080)	1
  (3462, 395)	1
  (3462, 1277)	1
  (3462, 1567)	1
  (3462, 1765)	1
  (3462, 2161)	1
  (3462, 2162)	1
  (3462, 2831)	1
  (3462, 2843)	1


- Array mapping from feature integer indices to feature name

In [12]:
vect.get_feature_names()[13]

'access'

## 2 - Example of simple SVM
Linear SVC with default parameters

In [13]:
learner = LinearSVC()
classifier = learner.fit(X_train_vect, y_train)
predictions = classifier.predict(X_test_vect)



In [14]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.9015304649148137
Classification report:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      2754
           1       0.77      0.75      0.76       709

    accuracy                           0.90      3463
   macro avg       0.85      0.84      0.85      3463
weighted avg       0.90      0.90      0.90      3463

Confusion matrix 
 [[2591  163]
 [ 178  531]]


## 3 - Example of SVM with TF-IDF
TF-IDF for feature extraction <br>
Linear SVC with default parameters 

In [15]:
vectorizer = TfidfVectorizer(min_df=5,ngram_range=(1, 2))
vectorizer.fit(X_train)
training_features = vectorizer.transform(X_train)
test_features =vectorizer.transform(X_test)

learner = LinearSVC()
classifier = learner.fit(training_features, y_train)
predictions = classifier.predict(test_features)

In [16]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.9127923765521224
Classification report:
              precision    recall  f1-score   support

           0       0.93      0.96      0.95      2754
           1       0.83      0.73      0.77       709

    accuracy                           0.91      3463
   macro avg       0.88      0.84      0.86      3463
weighted avg       0.91      0.91      0.91      3463

Confusion matrix 
 [[2645  109]
 [ 193  516]]


## 3 - Example of SVM with chi2
Use chi2 for feature selection: select features according to k highest score   <br>
Linear SVC with default parameters 

In [17]:
sel = SelectKBest(chi2, k=500)  
sel.fit(X_train_vect,y_train)
X_train_sel = sel.transform(X_train_vect)
X_test_sel = sel.transform(X_test_vect)

svm = LinearSVC()
svm_clf = svm.fit(X_train_sel,y_train)
predictions = svm_clf.predict(X_test_sel)



In [18]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.9087496390412937
Classification report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94      2754
           1       0.83      0.69      0.76       709

    accuracy                           0.91      3463
   macro avg       0.88      0.83      0.85      3463
weighted avg       0.91      0.91      0.91      3463

Confusion matrix 
 [[2655   99]
 [ 217  492]]


## 3 - Example of SVM with chi2 + TFIDF
Use chi2 for feature selection: select features according to k highest score   <br>
TFIDF transformer <br>
Linear SVC with default parameters 

In [19]:
sel = SelectKBest(chi2, k=500)  
sel.fit(X_train_vect,y_train)
X_train_sel = sel.transform(X_train_vect)
X_test_sel = sel.transform(X_test_vect)

tfidf = TfidfTransformer()  # weighting
tfidf.fit(X_train_sel)
X_train_vec_bestK = tfidf.transform(X_train_sel)
X_test_vec_bestK =tfidf.transform(X_test_sel)

learner = LinearSVC()  # linear svm with default parameters
classifier = learner.fit(X_train_vec_bestK, y_train)
predictions = classifier.predict(X_test_vec_bestK)

In [20]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.9067282702858793
Classification report:
              precision    recall  f1-score   support

           0       0.93      0.96      0.94      2754
           1       0.82      0.70      0.75       709

    accuracy                           0.91      3463
   macro avg       0.87      0.83      0.85      3463
weighted avg       0.90      0.91      0.90      3463

Confusion matrix 
 [[2645  109]
 [ 214  495]]


GridSearch function

In [21]:
def builtGridSearch(X_train, X_test, y_train, y_test, scores, model, tuned_parameters):
    optimals = {}
    for score in scores:
        print("------- Score = " + str(score) + " ------- \n")
        
        k_fold = StratifiedKFold(n_splits=3, random_state=42)
        print("> Fold = " + str(k_fold) + "\n")
        
        clf = GridSearchCV(model, tuned_parameters, error_score='raise', cv=k_fold, scoring = score, return_train_score=True)

        clf.fit(X_train, y_train)

        print("> Best Parameter set: \n")
        best = clf.best_params_
        print(best)
        
        print("\n> Grid scores:\n")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        
        print("...........RESULTS FOR TRAINING.........")
        print("........................................")
        means = clf.cv_results_['mean_train_score']
        stds = clf.cv_results_['std_train_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))        
        
        print("____________________________________________")
        
        optimals[score] = best
    return optimals

## 4 - LinearSVC + gridsearch tuning 

In [22]:
scores = ["f1_macro"]

params = {"selbestk__k": [200, 300, 500, 800, 1000],
          "svc__C": [.01, .1, 1, 10, 100]}

clf = Pipeline([
                ("selbestk", SelectKBest(score_func = chi2)),
                ("tfidf", TfidfTransformer()),
                ("svc", LinearSVC())
                ])

optimals = builtGridSearch(X_train_vect, X_test_vect, y_train, y_test, scores, clf, params)



------- Score = f1_macro ------- 

> Fold = StratifiedKFold(n_splits=3, random_state=42, shuffle=False)





> Best Parameter set: 

{'selbestk__k': 1000, 'svc__C': 1}

> Grid scores:

...........RESULTS FOR TRAINING.........
........................................
0.728 (+/-0.007) for {'selbestk__k': 200, 'svc__C': 0.01}
0.846 (+/-0.007) for {'selbestk__k': 200, 'svc__C': 0.1}
0.863 (+/-0.007) for {'selbestk__k': 200, 'svc__C': 1}
0.863 (+/-0.008) for {'selbestk__k': 200, 'svc__C': 10}
0.866 (+/-0.008) for {'selbestk__k': 200, 'svc__C': 100}
0.714 (+/-0.011) for {'selbestk__k': 300, 'svc__C': 0.01}
0.856 (+/-0.005) for {'selbestk__k': 300, 'svc__C': 0.1}
0.880 (+/-0.005) for {'selbestk__k': 300, 'svc__C': 1}
0.882 (+/-0.006) for {'selbestk__k': 300, 'svc__C': 10}
0.882 (+/-0.007) for {'selbestk__k': 300, 'svc__C': 100}
0.691 (+/-0.008) for {'selbestk__k': 500, 'svc__C': 0.01}
0.865 (+/-0.005) for {'selbestk__k': 500, 'svc__C': 0.1}
0.903 (+/-0.008) for {'selbestk__k': 500, 'svc__C': 1}
0.907 (+/-0.004) for {'selbestk__k': 500, 'svc__C': 10}
0.907 (+/-0.004) for {'selbestk__k': 500, 'svc__C'

In [23]:
sel = SelectKBest(chi2, k=optimals["f1_macro"]["selbestk__k"])  
sel.fit(X_train_vect,y_train)
X_train_sel = sel.transform(X_train_vect)
X_test_sel = sel.transform(X_test_vect)

tfidf = TfidfTransformer()
tfidf.fit(X_train_sel)
X_train_vec_bestK = tfidf.transform(X_train_sel)
X_test_vec_bestK =tfidf.transform(X_test_sel)

learner = LinearSVC(C=optimals["f1_macro"]["svc__C"])
classifier = learner.fit(X_train_vec_bestK, y_train)
predictions = classifier.predict(X_test_vec_bestK)

In [24]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.9156800462027144
Classification report:
              precision    recall  f1-score   support

           0       0.93      0.96      0.95      2754
           1       0.84      0.73      0.78       709

    accuracy                           0.92      3463
   macro avg       0.89      0.85      0.86      3463
weighted avg       0.91      0.92      0.91      3463

Confusion matrix 
 [[2656   98]
 [ 194  515]]


In [25]:
optimals

{'f1_macro': {'selbestk__k': 1000, 'svc__C': 1}}

## 4 - SVC + gridsearch tuning 

In [26]:
scores = ["f1_macro"]

params = {"selbestk__k": [200, 300, 500, 800, 1000],
          "svc__C": [.01, .1, 1, 10, 100]}

clf = Pipeline([
                ("selbestk", SelectKBest(score_func = chi2)),
                ("tfidf", TfidfTransformer()),
                ("svc", SVC())
                ])

optimals = builtGridSearch(X_train_vect, X_test_vect, y_train, y_test, scores, clf, params)

------- Score = f1_macro ------- 

> Fold = StratifiedKFold(n_splits=3, random_state=42, shuffle=False)





> Best Parameter set: 

{'selbestk__k': 1000, 'svc__C': 10}

> Grid scores:

...........RESULTS FOR TRAINING.........
........................................
0.512 (+/-0.005) for {'selbestk__k': 200, 'svc__C': 0.01}
0.760 (+/-0.008) for {'selbestk__k': 200, 'svc__C': 0.1}
0.892 (+/-0.009) for {'selbestk__k': 200, 'svc__C': 1}
0.936 (+/-0.008) for {'selbestk__k': 200, 'svc__C': 10}
0.940 (+/-0.009) for {'selbestk__k': 200, 'svc__C': 100}
0.443 (+/-0.000) for {'selbestk__k': 300, 'svc__C': 0.01}
0.724 (+/-0.012) for {'selbestk__k': 300, 'svc__C': 0.1}
0.914 (+/-0.003) for {'selbestk__k': 300, 'svc__C': 1}
0.959 (+/-0.003) for {'selbestk__k': 300, 'svc__C': 10}
0.962 (+/-0.004) for {'selbestk__k': 300, 'svc__C': 100}
0.443 (+/-0.000) for {'selbestk__k': 500, 'svc__C': 0.01}
0.674 (+/-0.018) for {'selbestk__k': 500, 'svc__C': 0.1}
0.936 (+/-0.004) for {'selbestk__k': 500, 'svc__C': 1}
0.982 (+/-0.002) for {'selbestk__k': 500, 'svc__C': 10}
0.983 (+/-0.001) for {'selbestk__k': 500, 'svc__C

In [27]:
sel = SelectKBest(chi2, k=optimals["f1_macro"]["selbestk__k"])  
sel.fit(X_train_vect,y_train)
X_train_sel = sel.transform(X_train_vect)
X_test_sel = sel.transform(X_test_vect)

tfidf = TfidfTransformer()
tfidf.fit(X_train_sel)
X_train_vec_bestK = tfidf.transform(X_train_sel)
X_test_vec_bestK =tfidf.transform(X_test_sel)

learner = SVC(C = optimals["f1_macro"]["svc__C"])
classifier = learner.fit(X_train_vec_bestK, y_train)
predictions = classifier.predict(X_test_vec_bestK)

In [28]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.9127923765521224
Classification report:
              precision    recall  f1-score   support

           0       0.93      0.97      0.95      2754
           1       0.84      0.71      0.77       709

    accuracy                           0.91      3463
   macro avg       0.88      0.84      0.86      3463
weighted avg       0.91      0.91      0.91      3463

Confusion matrix 
 [[2658   96]
 [ 206  503]]
