In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
data = pd.read_csv('Tweets.csv')

In [10]:
#data.head()

In [15]:
#sentiment_plot = data.groupby(['airline', 'airline_sentiment']).airline_sentiment.count().unstack().plot(kind = 'bar', figsize = (20, 10), fontsize = 12.0, xlabel = ('Airline'), ylabel = 'Sentiments Count', color=['#b30000', 'blue', 'yellow'])

In [16]:
features = data.iloc[:,10]
target = data.iloc[:,1]

In [17]:
#This piece of code will be used to clean the Irrelevant​ features like punctuation,commas and tages.
processed_features = []
for sentence in range(0, len(features)):
    processed_feature = re.sub(r'\W', ' ', str(features[sentence]))

    #remove all single characters
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    #remove single characters from the start
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 

    #Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

    #removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)
    
    #ronverting to Lowercase
    processed_feature = processed_feature.lower()

    processed_features.append(processed_feature)
    

In [18]:
#This piece of code can use stop words to remove the meaningless words like the, his, him and she.
corpus = []
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))
for sentence in processed_features:
    sentence = sentence.lower()
    words = sentence.split(" ")
    
    filtered_words = [word.lower() for word in words if not word in stopwords]
    corpus.append(" ".join(filtered_words))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ghussain\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1700, min_df=10, max_df=0.75)
processed_features = vectorizer.fit_transform(corpus).toarray()
processed_features.shape

(14640, 1700)

In [32]:
#Split data set into training and testing.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(processed_features, target, test_size=0.10, random_state=0)

In [33]:
#used Logistic Regression a machine learning algorithm.
from sklearn.linear_model import LogisticRegression
text_classifier = LogisticRegression(random_state = 0, max_iter = 200, solver = 'lbfgs', verbose = True)

text_classifier.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    2.9s finished


LogisticRegression(max_iter=200, random_state=0, verbose=True)

In [34]:
predictions = text_classifier.predict(X_test)

In [35]:
#Evaltation methods include precision,recall,fi-score and accuracy. 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print(confusion_matrix(y_test, predictions))

print(classification_report(y_test, predictions))

print(accuracy_score(y_test, predictions))

[[1733  110   27]
 [ 263  310   41]
 [ 119   61  264]]
              precision    recall  f1-score   support

    negative       0.82      0.93      0.87      1870
     neutral       0.64      0.50      0.57       614
    positive       0.80      0.59      0.68       444

    accuracy                           0.79      2928
   macro avg       0.75      0.68      0.71      2928
weighted avg       0.78      0.79      0.78      2928

0.7879098360655737


In [51]:
#in this section we will use deep learning methods to predict positive,negative and neutral sentiments.
import tensorflow as tf
#tf.__version__

In [52]:
from tensorflow import keras
from tensorflow.keras import layers

In [65]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Dropout, Activation
from tensorflow.keras.layers import Embedding

In [75]:
#Split data set into training and testing.
ycat=pd.get_dummies(target).values
X=features.values
tk = Tokenizer()
tk.fit_on_texts(X)
X_seq = tk.texts_to_sequences(X)
X_pad = pad_sequences(X_seq, maxlen=100, padding='post')
X_train, X_test, y_train, y_test = train_test_split(X_pad, ycat, test_size = 0.20, random_state = 0)

In [76]:
#Hyperprameters tuning 
vocabulary_size = len(tk.word_counts.keys())+1
max_words = 100
embedding_size = 32
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_size, input_length=max_words))
model.add(Flatten())
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 100, 32)           504608    
                                                                 
 flatten_9 (Flatten)         (None, 3200)              0         
                                                                 
 dense_9 (Dense)             (None, 3)                 9603      
                                                                 
Total params: 514,211
Trainable params: 514,211
Non-trainable params: 0
_________________________________________________________________


In [78]:
history=model.fit(X_train,y_train,validation_data=(X_test,y_test),batch_size=32,epochs=3,verbose=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [80]:
#Results/output
predictions = [np.argmax(i) for i in model.predict(X_test)]
y_test=[np.argmax(i) for i in y_test]

print(confusion_matrix(y_test, predictions))

print(classification_report(y_test,predictions,target_names=['negative','neutral','positive'])),

print(accuracy_score(y_test, predictions))


[[1968  545  415]
 [   0    0    0]
 [   0    0    0]]
              precision    recall  f1-score   support

    negative       1.00      0.67      0.80      2928
     neutral       0.00      0.00      0.00         0
    positive       0.00      0.00      0.00         0

    accuracy                           0.67      2928
   macro avg       0.33      0.22      0.27      2928
weighted avg       1.00      0.67      0.80      2928

0.6721311475409836


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#Conclusion:- for small dataset, Machine learning baseline methods can have better results that Deep learning methods. The reson deep learning need huge data.