In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import matplotlib.pyplot as plt

from sklearn.metrics import *
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer

from sklearn.svm import LinearSVC, SVC

from sklearn.feature_selection import SelectKBest, chi2

In [2]:
df = pd.read_csv("final_dataset.csv")

- Divide Training - Test (70-30)
- Encoding test:
                 0: negative
                 1: neutral
                 2: positive

In [3]:
df.head()

Unnamed: 0,original_text,preprocessed_text,length_text,not_tag_text,airline,airline_sentiment,negative_reason
0,@VirginAmerica What @dhepburn said.,@mention @mention say,22,say,Virgin America,neutral,
1,@VirginAmerica plus you've added commercials t...,@mention plus add commercial experience tacky,46,plus add commercial experience tacky,Virgin America,positive,
2,@VirginAmerica I didn't today... Must mean I n...,@mention today must mean need take another trip,48,today must mean need take another trip,Virgin America,neutral,
3,@VirginAmerica it's really aggressive to blast...,@mention really aggressive blast obnoxious ent...,88,really aggressive blast obnoxious entertainme...,Virgin America,negative,Bad Flight
4,@VirginAmerica and it's a really big bad thing...,@mention really big bad thing,30,really big bad thing,Virgin America,negative,Can't Tell


In [4]:
encoder = LabelEncoder()
X = df['preprocessed_text']
y = encoder.fit_transform(df["airline_sentiment"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1000, stratify=y)

In [5]:
len(X_train),len(y_train),len(X_test),len(y_test)

(10248, 10248, 4392, 4392)

In [6]:
def model_evaluation(real_v, pred_v):
    print(f"Accuracy sore: {accuracy_score(real_v, pred_v)}")
    print("Classification report:")
    print(classification_report(real_v, pred_v))
    cm = confusion_matrix(real_v, pred_v)
    print (f"Confusion matrix \n {cm}")

## 1 - Vectorization
- Fit to learn a vocabulary dictionary of all tokens in the raw documents
- Transform documents to document-term matrix
- Extract token counts out of raw text documents using the vocabulary fitted with fit or the one provided to the constructor

In [7]:
vect = CountVectorizer(min_df = 5, ngram_range = (1,2))

In [8]:
print(vect)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=5,
                ngram_range=(1, 2), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


In [9]:
vect.fit(X_train)
X_train_vect = vect.transform(X_train)
print(X_train_vect)

  (0, 59)	1
  (0, 446)	1
  (0, 1107)	1
  (0, 1878)	1
  (0, 2209)	1
  (0, 2576)	1
  (0, 2598)	1
  (0, 2647)	1
  (0, 2794)	1
  (0, 3255)	1
  (0, 3516)	1
  (0, 3704)	1
  (0, 3761)	1
  (0, 3769)	1
  (0, 3827)	1
  (1, 1366)	1
  (1, 1380)	1
  (1, 1878)	2
  (1, 3611)	1
  (1, 3614)	1
  (2, 1878)	1
  (2, 2235)	1
  (2, 3713)	1
  (3, 667)	1
  (3, 674)	1
  :	:
  (10246, 383)	1
  (10246, 627)	1
  (10246, 971)	1
  (10246, 972)	1
  (10246, 1085)	1
  (10246, 1366)	1
  (10246, 1375)	1
  (10246, 1398)	1
  (10246, 1878)	1
  (10246, 2365)	1
  (10246, 2375)	1
  (10246, 2434)	2
  (10246, 2449)	1
  (10246, 2492)	1
  (10246, 2598)	1
  (10246, 2599)	1
  (10247, 125)	1
  (10247, 562)	1
  (10247, 1878)	1
  (10247, 1949)	1
  (10247, 2721)	1
  (10247, 2733)	1
  (10247, 3157)	1
  (10247, 3611)	1
  (10247, 3759)	1


In [10]:
X_test_vect =vect.transform(X_test)
print(X_test_vect)

  (0, 191)	1
  (0, 446)	1
  (0, 449)	1
  (0, 971)	1
  (0, 978)	1
  (0, 1068)	1
  (0, 1366)	1
  (0, 1398)	1
  (0, 1405)	1
  (0, 1878)	1
  (0, 2094)	1
  (0, 2365)	2
  (0, 2370)	1
  (0, 2375)	1
  (0, 2655)	1
  (0, 3200)	1
  (0, 3555)	1
  (1, 344)	1
  (1, 345)	1
  (1, 481)	2
  (1, 814)	1
  (1, 1172)	1
  (1, 1336)	1
  (1, 1874)	1
  (1, 1878)	1
  :	:
  (4389, 1993)	1
  (4389, 3327)	1
  (4389, 3588)	1
  (4390, 803)	2
  (4390, 804)	1
  (4390, 1349)	1
  (4390, 1635)	1
  (4390, 1878)	1
  (4390, 2018)	1
  (4390, 3377)	1
  (4391, 55)	1
  (4391, 794)	1
  (4391, 978)	1
  (4391, 1035)	1
  (4391, 1262)	1
  (4391, 1878)	1
  (4391, 2434)	2
  (4391, 2567)	1
  (4391, 2844)	1
  (4391, 2895)	1
  (4391, 2914)	1
  (4391, 3187)	1
  (4391, 3605)	1
  (4391, 3719)	1
  (4391, 3720)	1


- Array mapping from feature integer indices to feature name

In [11]:
vect.get_feature_names()[13]

'acceptable'

## 2 - Example of simple SVM
Linear SVC with default parameters

In [12]:
learner = LinearSVC()
classifier = learner.fit(X_train_vect, y_train)
predictions = classifier.predict(X_test_vect)



In [13]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.7556921675774135
Classification report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84      2753
           1       0.56      0.54      0.55       930
           2       0.70      0.65      0.67       709

    accuracy                           0.76      4392
   macro avg       0.69      0.68      0.69      4392
weighted avg       0.75      0.76      0.75      4392

Confusion matrix 
 [[2359  283  111]
 [ 340  499   91]
 [ 132  116  461]]


## 3 - Example of SVM with TF-IDF
TF-IDF for feature extraction <br>
Linear SVC with default parameters 

In [14]:
vectorizer = TfidfVectorizer(min_df=5,ngram_range=(1, 2))
vectorizer.fit(X_train)
training_features = vectorizer.transform(X_train)
test_features =vectorizer.transform(X_test)

learner = LinearSVC()
classifier = learner.fit(training_features, y_train)
predictions = classifier.predict(test_features)

In [15]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.7750455373406193
Classification report:
              precision    recall  f1-score   support

           0       0.82      0.90      0.86      2753
           1       0.61      0.49      0.55       930
           2       0.75      0.65      0.70       709

    accuracy                           0.78      4392
   macro avg       0.73      0.68      0.70      4392
weighted avg       0.77      0.78      0.77      4392

Confusion matrix 
 [[2481  197   75]
 [ 390  459   81]
 [ 153   92  464]]


## 3 - Example of SVM with chi2
Use chi2 for feature selection: select features according to k highest score   <br>
Linear SVC with default parameters 

In [16]:
sel = SelectKBest(chi2, k=500)  
sel.fit(X_train_vect,y_train)
X_train_sel = sel.transform(X_train_vect)
X_test_sel = sel.transform(X_test_vect)

svm = LinearSVC()
svm_clf = svm.fit(X_train_sel,y_train)
predictions = svm_clf.predict(X_test_sel)



In [17]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.7661657559198543
Classification report:
              precision    recall  f1-score   support

           0       0.81      0.90      0.86      2753
           1       0.60      0.48      0.53       930
           2       0.73      0.62      0.67       709

    accuracy                           0.77      4392
   macro avg       0.71      0.67      0.69      4392
weighted avg       0.75      0.77      0.76      4392

Confusion matrix 
 [[2481  193   79]
 [ 404  443   83]
 [ 162  106  441]]


## 3 - Example of SVM with chi2 + TFIDF
Use chi2 for feature selection: select features according to k highest score   <br>
TFIDF transformer <br>
Linear SVC with default parameters 

In [18]:
sel = SelectKBest(chi2, k=500)  
sel.fit(X_train_vect,y_train)
X_train_sel = sel.transform(X_train_vect)
X_test_sel = sel.transform(X_test_vect)

tfidf = TfidfTransformer()  # weighting
tfidf.fit(X_train_sel)
X_train_vec_bestK = tfidf.transform(X_train_sel)
X_test_vec_bestK =tfidf.transform(X_test_sel)

learner = LinearSVC()  # linear svm with default parameters
classifier = learner.fit(X_train_vec_bestK, y_train)
predictions = classifier.predict(X_test_vec_bestK)

In [19]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.7650273224043715
Classification report:
              precision    recall  f1-score   support

           0       0.81      0.91      0.86      2753
           1       0.61      0.45      0.52       930
           2       0.72      0.63      0.67       709

    accuracy                           0.77      4392
   macro avg       0.71      0.66      0.68      4392
weighted avg       0.75      0.77      0.75      4392

Confusion matrix 
 [[2498  176   79]
 [ 419  416   95]
 [ 170   93  446]]


GridSearch function

In [20]:
def builtGridSearch(X_train, X_test, y_train, y_test, scores, model, tuned_parameters):
    optimals = {}
    for score in scores:
        print("------- Score = " + str(score) + " ------- \n")
        
        k_fold = StratifiedKFold(n_splits=3, random_state=42)
        print("> Fold = " + str(k_fold) + "\n")
        
        clf = GridSearchCV(model, tuned_parameters, error_score='raise', cv=k_fold, scoring = score, return_train_score=True)

        clf.fit(X_train, y_train)

        print("> Best Parameter set: \n")
        best = clf.best_params_
        print(best)
        
        print("\n> Grid scores:\n")
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        
        print("...........RESULTS FOR TRAINING.........")
        print("........................................")
        means = clf.cv_results_['mean_train_score']
        stds = clf.cv_results_['std_train_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))        
        
        print("____________________________________________")
        
        optimals[score] = best
    return optimals

## 4 - LinearSVC + gridsearch tuning 

In [21]:
scores = ["f1_macro"]

params = {"selbestk__k": [200, 300, 500, 800, 1000],
          "svc__C": [.01, .1, 1, 10, 100]}

clf = Pipeline([
                ("selbestk", SelectKBest(score_func = chi2)),
                ("tfidf", TfidfTransformer()),
                ("svc", LinearSVC())
                ])

optimals = builtGridSearch(X_train_vect, X_test_vect, y_train, y_test, scores, clf, params)

------- Score = f1_macro ------- 

> Fold = StratifiedKFold(n_splits=3, random_state=42, shuffle=False)





> Best Parameter set: 

{'selbestk__k': 800, 'svc__C': 1}

> Grid scores:

...........RESULTS FOR TRAINING.........
........................................
0.554 (+/-0.008) for {'selbestk__k': 200, 'svc__C': 0.01}
0.646 (+/-0.038) for {'selbestk__k': 200, 'svc__C': 0.1}
0.684 (+/-0.000) for {'selbestk__k': 200, 'svc__C': 1}
0.685 (+/-0.001) for {'selbestk__k': 200, 'svc__C': 10}
0.683 (+/-0.009) for {'selbestk__k': 200, 'svc__C': 100}
0.546 (+/-0.009) for {'selbestk__k': 300, 'svc__C': 0.01}
0.696 (+/-0.006) for {'selbestk__k': 300, 'svc__C': 0.1}
0.716 (+/-0.002) for {'selbestk__k': 300, 'svc__C': 1}
0.719 (+/-0.003) for {'selbestk__k': 300, 'svc__C': 10}
0.718 (+/-0.006) for {'selbestk__k': 300, 'svc__C': 100}
0.529 (+/-0.009) for {'selbestk__k': 500, 'svc__C': 0.01}
0.719 (+/-0.005) for {'selbestk__k': 500, 'svc__C': 0.1}
0.749 (+/-0.008) for {'selbestk__k': 500, 'svc__C': 1}
0.754 (+/-0.010) for {'selbestk__k': 500, 'svc__C': 10}
0.753 (+/-0.011) for {'selbestk__k': 500, 'svc__C':



In [22]:
sel = SelectKBest(chi2, k=optimals["f1_macro"]["selbestk__k"])  
sel.fit(X_train_vect,y_train)
X_train_sel = sel.transform(X_train_vect)
X_test_sel = sel.transform(X_test_vect)

tfidf = TfidfTransformer()
tfidf.fit(X_train_sel)
X_train_vec_bestK = tfidf.transform(X_train_sel)
X_test_vec_bestK =tfidf.transform(X_test_sel)

learner = LinearSVC(C=optimals["f1_macro"]["svc__C"])
classifier = learner.fit(X_train_vec_bestK, y_train)
predictions = classifier.predict(X_test_vec_bestK)

In [23]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.7773224043715847
Classification report:
              precision    recall  f1-score   support

           0       0.82      0.91      0.86      2753
           1       0.63      0.49      0.55       930
           2       0.75      0.65      0.69       709

    accuracy                           0.78      4392
   macro avg       0.73      0.68      0.70      4392
weighted avg       0.77      0.78      0.77      4392

Confusion matrix 
 [[2499  177   77]
 [ 396  456   78]
 [ 157   93  459]]


In [24]:
optimals

{'f1_macro': {'selbestk__k': 800, 'svc__C': 1}}

## 4 - SVC + gridsearch tuning 

In [25]:
scores = ["f1_macro"]

params = {"selbestk__k": [200, 300, 500, 800, 1000],
          "svc__C": [.01, .1, 1, 10, 100]}

clf = Pipeline([
                ("selbestk", SelectKBest(score_func = chi2)),
                ("tfidf", TfidfTransformer()),
                ("svc", SVC())
                ])

optimals = builtGridSearch(X_train_vect, X_test_vect, y_train, y_test, scores, clf, params)

------- Score = f1_macro ------- 

> Fold = StratifiedKFold(n_splits=3, random_state=42, shuffle=False)





> Best Parameter set: 

{'selbestk__k': 1000, 'svc__C': 1}

> Grid scores:

...........RESULTS FOR TRAINING.........
........................................
0.300 (+/-0.005) for {'selbestk__k': 200, 'svc__C': 0.01}
0.581 (+/-0.048) for {'selbestk__k': 200, 'svc__C': 0.1}
0.729 (+/-0.010) for {'selbestk__k': 200, 'svc__C': 1}
0.831 (+/-0.014) for {'selbestk__k': 200, 'svc__C': 10}
0.843 (+/-0.017) for {'selbestk__k': 200, 'svc__C': 100}
0.285 (+/-0.040) for {'selbestk__k': 300, 'svc__C': 0.01}
0.562 (+/-0.007) for {'selbestk__k': 300, 'svc__C': 0.1}
0.771 (+/-0.011) for {'selbestk__k': 300, 'svc__C': 1}
0.885 (+/-0.012) for {'selbestk__k': 300, 'svc__C': 10}
0.894 (+/-0.010) for {'selbestk__k': 300, 'svc__C': 100}
0.257 (+/-0.000) for {'selbestk__k': 500, 'svc__C': 0.01}
0.497 (+/-0.037) for {'selbestk__k': 500, 'svc__C': 0.1}
0.816 (+/-0.009) for {'selbestk__k': 500, 'svc__C': 1}
0.937 (+/-0.008) for {'selbestk__k': 500, 'svc__C': 10}
0.940 (+/-0.009) for {'selbestk__k': 500, 'svc__C'

In [26]:
sel = SelectKBest(chi2, k=optimals["f1_macro"]["selbestk__k"])  
sel.fit(X_train_vect,y_train)
X_train_sel = sel.transform(X_train_vect)
X_test_sel = sel.transform(X_test_vect)

tfidf = TfidfTransformer()
tfidf.fit(X_train_sel)
X_train_vec_bestK = tfidf.transform(X_train_sel)
X_test_vec_bestK =tfidf.transform(X_test_sel)

learner = SVC(C = optimals["f1_macro"]["svc__C"])
classifier = learner.fit(X_train_vec_bestK, y_train)
predictions = classifier.predict(X_test_vec_bestK)

In [27]:
model_evaluation(y_test, predictions)

Accuracy sore: 0.7766393442622951
Classification report:
              precision    recall  f1-score   support

           0       0.80      0.92      0.86      2753
           1       0.66      0.46      0.54       930
           2       0.76      0.62      0.68       709

    accuracy                           0.78      4392
   macro avg       0.74      0.67      0.70      4392
weighted avg       0.77      0.78      0.76      4392

Confusion matrix 
 [[2543  143   67]
 [ 431  430   69]
 [ 194   77  438]]
