# **Packages Import**

In [17]:
import re
import nltk
import pandas as pd
from nltk.corpus                         import stopwords
from sklearn.feature_extraction.text     import TfidfVectorizer
from sklearn.model_selection             import train_test_split
from sklearn.neural_network              import MLPClassifier
from sklearn.svm                         import SVC
from sklearn.naive_bayes                 import GaussianNB
from sklearn.metrics                     import confusion_matrix,classification_report

# **Business & Data Understanding**

Extract (E): Load the dataset

In [3]:
df=pd.read_csv("/content/Tweets.csv")
# Display the first few rows of the dataset
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


Display information about the dataset

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 15 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

Display the distribution of labels in the dataset

In [5]:
df.airline_sentiment.value_counts()

Unnamed: 0_level_0,count
airline_sentiment,Unnamed: 1_level_1
negative,9178
neutral,3099
positive,2363


# **Data Preparation**

Extract the features and labels

In [6]:
features=df.iloc[:,10]
labels=df.iloc[:,1]

## **Data Cleaning**

Transform (T): Data Preprocessing
Remove non-alphabetic characters and single characters

In [7]:
tidy_features=[]
for i in range(len(features)):
  tmp=re.sub(r'[^a-zA-Z]',' ',features[i])
  tmp=re.sub(r'\s[a-zA-Z]\s',' ',tmp)
  tmp=re.sub(r'\s+',' ',tmp)
  tmp=tmp.lower()
  tidy_features.append(tmp)

print("******** Before **********")
print(features[0:4])

******** Before **********
0                  @VirginAmerica What @dhepburn said.
1    @VirginAmerica plus you've added commercials t...
2    @VirginAmerica I didn't today... Must mean I n...
3    @VirginAmerica it's really aggressive to blast...
Name: text, dtype: object


In [8]:
print("******** After **********")
print(tidy_features[0:4])

******** After **********
[' virginamerica what dhepburn said ', ' virginamerica plus you ve added commercials to the experience tacky ', ' virginamerica didn today must mean need to take another trip ', ' virginamerica it really aggressive to blast obnoxious entertainment in your guests faces amp they have little recourse']


## **Word Embedding**

In [9]:
nltk.download('stopwords')
# Vectorize the text data using TF-IDF

vectorizer=TfidfVectorizer(max_features=2000,min_df=7,max_df=0.8,stop_words=stopwords.words('english'))
X=vectorizer.fit_transform(tidy_features).toarray()
X

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

## **Data Split**

In [10]:
X_train,X_test,y_train,y_test=train_test_split(X,labels,test_size=0.2)

# **Machine Learning: NB Vs SVM Vs Neural Network**

Load (L): Model Training and Evaluation

In [11]:
gnb=GaussianNB() # gnb is a naive bayes classifier
linear_svm  =SVC(kernel='linear') # linear_svm is a Linear Support Vectors
rbf_svm     =SVC(kernel='rbf')    # rbf_svm is a RBF support vectors
sigmoid_svm =SVC(kernel='sigmoid')# sigmoid support vectors
ploy_svm    =SVC(kernel='poly',degree=2) # Ploynom with degree=2 as support vectors
neural=MLPClassifier(hidden_layer_sizes=(100,20),activation='logistic',solver='adam') # neural is a neural network classification

Train the models

In [12]:
gnb.fit(X_train,y_train) # Train Guassian NB classifier
linear_svm.fit(X_train,y_train) # Train SVM
rbf_svm.fit(X_train,y_train)
sigmoid_svm.fit(X_train,y_train)
ploy_svm.fit(X_train,y_train)
neural.fit(X_train,y_train) # Train Neural Network - finding the best weight matrix



Predict using the models

In [13]:
y_nb=gnb.predict(X_test)
y_linear_svm=linear_svm.predict(X_test)
y_rbf_svm=rbf_svm.predict(X_test)
y_ploy_svm=ploy_svm.predict(X_test)
y_sigmoid_svm=sigmoid_svm.predict(X_test)
y_neural=neural.predict(X_test)

#  **Performance Evaluation**


In [14]:
print ('************* Peformance Evauation of Naive Bayes **************')
print(confusion_matrix(y_test,y_nb))
print(classification_report(y_test,y_nb))
print ('************* Peformance Evauation of Linear SVM **************')
print(confusion_matrix(y_test,y_linear_svm))
print(classification_report(y_test,y_linear_svm))
print ('************* Peformance Evauation of RBF SVM **************')
print(confusion_matrix(y_test,y_rbf_svm))
print(classification_report(y_test,y_rbf_svm))
print ('************* Peformance Evauation of Sigmoid SVM **************')
print(confusion_matrix(y_test,y_sigmoid_svm))
print(classification_report(y_test,y_sigmoid_svm))
print ('************* Peformance Evauation of Polynomial (2) SVM **************')
print(confusion_matrix(y_test,y_ploy_svm))
print(classification_report(y_test,y_ploy_svm))
print ('************* Peformance Evauation of Neural Network **************')
print(confusion_matrix(y_test,y_neural))
print(classification_report(y_test,y_neural))

************* Peformance Evauation of Naive Bayes **************
[[552 480 804]
 [ 46 189 382]
 [ 31  71 373]]
              precision    recall  f1-score   support

    negative       0.88      0.30      0.45      1836
     neutral       0.26      0.31      0.28       617
    positive       0.24      0.79      0.37       475

    accuracy                           0.38      2928
   macro avg       0.46      0.46      0.36      2928
weighted avg       0.64      0.38      0.40      2928

************* Peformance Evauation of Linear SVM **************
[[1683  119   34]
 [ 244  321   52]
 [ 119   58  298]]
              precision    recall  f1-score   support

    negative       0.82      0.92      0.87      1836
     neutral       0.64      0.52      0.58       617
    positive       0.78      0.63      0.69       475

    accuracy                           0.79      2928
   macro avg       0.75      0.69      0.71      2928
weighted avg       0.78      0.79      0.78      2928

********

In [15]:
import pickle

iasria_vect=pickle.dump(vectorizer,open("iasria_vect.pickle",'wb'))
iasria_model=pickle.dump(neural,open("iasria_model.pickle",'wb'))