# Medium Blog: 
## I have explained the step-by-step approach to this problem in the blog post below. Please do check it out :)
https://parisrohan.medium.com/twitter-sentiment-analysis-and-classification-7060d4444a27

## 1. Data Collection

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
#Load data-preprocessing libraries
import pandas as pd
import numpy as np

#Text processing libraries
import re
import nltk
from nltk.corpus import stopwords  
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer #feature extraction

#Load data-visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

#model building
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

#evaluation metrics
from sklearn.metrics import confusion_matrix,f1_score,accuracy_score

# Download the NLTK resources used below: the English stop-word list and the
# WordNet corpus that WordNetLemmatizer looks words up in (without it,
# lemmatizer.lemmatize() raises LookupError on a fresh environment).
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

# Show every column when a dataframe is displayed
pd.set_option('display.max_columns', None)

In [None]:
#Load training data; header=None means no row is treated as a header,
#so the columns get integer labels (0, 1, 2, ...)
df_data=pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv',header=None)
df_data.head()

In [None]:
#Load test data (read from twitter_validation.csv); header=None means no row
#is treated as a header, so the columns get integer labels
df_test=pd.read_csv('/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv',header=None)
df_test.head()

Note:
* It can be observed that the train and test datasets have the same number of columns, and that the columns do not have descriptive names
* We will rename the columns of the dataframe to get a better sense of the data

In [None]:
#Give the integer-labelled columns descriptive names
def rename_dataframe(df):
    """Rename columns 0-3 of ``df`` in place.

    Column 0 becomes 'Tweet_ID', 1 'Topic', 2 'Sentiment' and 3 'Tweet'.
    Labels that are not present in ``df`` are ignored. Returns None.
    """
    new_names = dict(enumerate(['Tweet_ID', 'Topic', 'Sentiment', 'Tweet']))
    df.rename(columns=new_names, inplace=True)

In [None]:
#Rename the train data columns (done in place), then preview the first rows
rename_dataframe(df_data)
df_data.head()

In [None]:
#Rename the test data columns (done in place), then preview the first rows
rename_dataframe(df_test)
df_test.head()

## 2. EDA

In [None]:
#Report (rows, columns) for both dataframes
for caption, frame in (('Train data shape: ', df_data), ('Test data shape: ', df_test)):
    print(caption, frame.shape)

In [None]:
#percentage of missing values in each column
(df_data.isnull().sum()/len(df_data))*100

In [None]:
#Only 0.91% of the values in the Tweet column are null, so we drop those rows
df_data.dropna(axis=0,inplace=True)

In [None]:
#re-check the percentage of missing values per column after dropping the rows with nulls
(df_data.isnull().sum()/len(df_data))*100

In [None]:
#Print a summary of the training dataframe
df_data.info()

In [None]:
#Show how often each distinct value occurs in the categorical columns
col=['Topic','Sentiment']
print('Distinct Values: \n')
for column_name in col:
    counts = df_data[column_name].value_counts()
    print(column_name,'->')
    print(counts)
    print('\n')

In [None]:
#Visualizing Sentiment feature - Target feature
plt.figure(figsize=(15,5))

#Count each sentiment once and reuse the result for both charts
sentiment_counts = df_data['Sentiment'].value_counts()
label = sentiment_counts.index
label_count = sentiment_counts.values

#plot pie chart (values and labels are passed directly, so no data= lookup is needed)
plt.subplot(1,2,1)
plt.pie(x=label_count, labels=label, autopct='%1.1f%%', shadow=True, radius=1.5)

#plot countplot with the bars in the same order as the pie slices
plt.subplot(1,2,2)
ax = sns.countplot(x='Sentiment', data=df_data, order=label)
#write each bar's count just above the top of the bar, centred horizontally
for p in ax.patches:
    height=p.get_height()
    ax.text(x=p.get_x()+(p.get_width()/2), y=height+0.2, ha='center', s='{:.0f}'.format(height))
plt.show()

Note:
* From the above graphs we can observe that the dataset is balanced.

In [None]:
#Get the count of whitespace-separated words in each tweet
df_data['Tweet_word_count']=df_data['Tweet'].apply(lambda x: len(x.split()))

In [None]:
#Box plot (top) and density histogram with KDE curve (bottom) of tokens per tweet
plt.figure(figsize=(15,10))
plt.subplot(2,1,1)
sns.boxplot(x=df_data['Tweet_word_count'])
plt.title('Distribution of number of tokens in tweets')

plt.subplot(2,1,2)
#sns.distplot is deprecated; histplot with kde=True and stat='density' draws
#the equivalent histogram + density curve
sns.histplot(x=df_data['Tweet_word_count'], kde=True, stat='density')
plt.show()

Note:
* The average tweet length is around 23 tokens.
* The distribution is positively skewed.

In [None]:
#Extreme outliers: tweets with more than 125 words
extreme_outliers = df_data.loc[df_data['Tweet_word_count']>125, 'Tweet']

#Print the row label, the sentiment and the full text of each outlier
for row_label, tweet_text in extreme_outliers.items():
    print(row_label,'Tweet Sentiment: ',df_data['Sentiment'][row_label])
    print(tweet_text)
    print('\n')

In [None]:
#Get count of characters in each tweet excluding space characters
#(only ' ' is subtracted; tabs and newlines are still counted)
df_data['Tweet_char_count']=df_data['Tweet'].apply(lambda x: len(x)-x.count(' '))

In [None]:
#Box plot (top) and density histogram with KDE curve (bottom) of characters per tweet
plt.figure(figsize=(15,10))
plt.subplot(2,1,1)
sns.boxplot(x=df_data['Tweet_char_count'])
plt.title('Distribution of number of characters in tweets')

plt.subplot(2,1,2)
#sns.distplot is deprecated; histplot with kde=True and stat='density' draws
#the equivalent histogram + density curve
sns.histplot(x=df_data['Tweet_char_count'], kde=True, stat='density')
plt.show()

## 3. Data preprocessing

In [None]:
#Remove user mentions from the tweets
#'I want to join @google' will become 'I want to join ' (the leftover space is
#collapsed later, when whitespace is normalised)
#Twitter handles may contain underscores, so '_' is part of the pattern;
#otherwise '@user_name' would leave '_name' behind in the text.
df_data['Tweet_clean']=df_data['Tweet'].apply(lambda x: re.sub(r'@[A-Za-z0-9_]+','',x))

In [None]:
#Remove the '#' symbol from hashtags while keeping the tag text itself
#'My new house #goals' will become 'My new house goals'
df_data['Tweet_clean']=df_data['Tweet_clean'].apply(lambda x: re.sub('#','',x))

In [None]:
# Dictionary mapping English contractions to their expanded forms;
# it is used below to replace contractions token by token in the tweets
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}

In [None]:
#Expand contractions, e.g. "don't" -> "do not"
def _expand_contractions(text):
    """Replace every space-separated token of ``text`` found in contraction_mapping.

    Typographic apostrophes (’) are first normalised to "'" so that "don’t"
    is recognised. A token is looked up as written and, if that fails, in
    lowercase, so capitalised forms such as "Don't" are expanded too (the
    text is lowercased later in the pipeline, so the case of the expansion
    does not matter).
    """
    tokens = text.replace('\u2019', "'").split(" ")
    return ' '.join(
        contraction_mapping.get(tok, contraction_mapping.get(tok.lower(), tok))
        for tok in tokens
    )

df_data['Tweet_clean']=df_data['Tweet_clean'].apply(_expand_contractions)

In [None]:
#Remove links/urls from the tweets
#(everything from 'http' up to the next whitespace character is deleted)
df_data['Tweet_clean']=df_data['Tweet_clean'].apply(lambda x: re.sub(r'http\S+','',x))

In [None]:
def txt_conversion(sentence):
    """Normalise a tweet to lowercase, letters-only, single-spaced text.

    Parameters:
        sentence: the raw tweet text (str).

    Returns:
        The text with every character other than a-z/A-Z removed, converted
        to lowercase, with words separated by exactly one space and no
        leading or trailing whitespace.
    """
    #Turn newlines, tabs and other whitespace into plain spaces first;
    #otherwise the letter filter below deletes them and glues the
    #neighbouring words together ("hello\nworld" -> "helloworld")
    sentence=re.sub(r'\s+',' ',sentence)
    #Keep only letters and spaces
    sentence=re.sub(r'[^a-zA-Z ]','',sentence)
    #Lowercase, then split/join to collapse runs of spaces and trim the ends
    return " ".join(sentence.lower().split())

In [None]:
#Normalise every tweet with txt_conversion (passed directly, no lambda wrapper)
df_data['Tweet_clean'] = df_data['Tweet_clean'].apply(txt_conversion)

In [None]:
#Removing stop-words and converting words to lemma
#Stop words are the most commonly used words in a language (such as “the”, “a”, “an”, “in”)
#Example: after lemmatization, inflected forms such as "cats" or "geese" become "cat" and "goose"
def stop_wrds_lemma_convert(sentence):
    """Drop stop-words from ``sentence`` and lemmatize the remaining words.

    Parameters:
        sentence: text whose words are separated by whitespace (str).

    Returns:
        The lemmatized non-stop-words joined by single spaces; an empty
        string when no word remains.
    """
    return ' '.join(
        lemmatizer.lemmatize(word)          #converting words to lemma
        for word in sentence.split()
        if word not in stop_words           #stopwords removal
    )

In [None]:
#Remove stop-words and lemmatize every tweet (function passed directly, no lambda wrapper)
df_data['Tweet_clean'] = df_data['Tweet_clean'].apply(stop_wrds_lemma_convert)

In [None]:
#Drop columns that are not required
#(columns= already selects the column axis, so no axis argument is passed)
df_data.drop(columns=['Tweet_ID','Tweet'],inplace=True)

In [None]:
#Cleaning can leave a tweet with no text at all (e.g. it held only a link, a
#mention or stop-words). Such rows contain an empty string rather than NaN, so
#dropna alone would keep them; drop them explicitly first.
df_data.drop(index=df_data.index[df_data['Tweet_clean'].str.len()==0],inplace=True)
#Then drop any remaining row that has a missing value
df_data.dropna(axis=0,how='any',inplace=True)

In [None]:
#Draw one word cloud per sentiment class on a 2x2 grid
plt.figure(figsize=(15,10))

for position, sentiment in enumerate(['Positive','Negative','Neutral','Irrelevant'], start=1):
    plt.subplot(2,2,position)
    #Concatenate all cleaned tweets of this sentiment into a single text
    all_words=' '.join(df_data.loc[df_data['Sentiment']==sentiment,'Tweet_clean'])
    #Fixed random_state keeps the layout reproducible between runs
    wordcloud=WordCloud(width=800,height=500,random_state=21,max_font_size=110).generate(all_words)
    plt.title('Sentiment: '+sentiment)
    plt.imshow(wordcloud)
    plt.axis('off')

## 4. Model Building

In [None]:
#Separate the dependent feature (Sentiment) from the independent features
y=df_data['Sentiment']
X=df_data.drop(columns='Sentiment')

#Hold out 30% of the rows as a validation set; the fixed seed keeps the split reproducible
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.7, test_size=0.3, random_state=0
)

#Summarize the shapes of both parts
for split_name, part_X, part_y in (('Train', X_train, y_train), ('Test', X_valid, y_valid)):
    print(split_name, part_X.shape, part_y.shape)

In [None]:
#TF-IDF features built from word 1- to 3-grams
vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1,3),
    min_df=10,
    max_features=10000,
)
train_text = X_train['Tweet_clean']
valid_text = X_valid['Tweet_clean']
#Fit the vectorizer on the training tweets only
features_train= vectorizer.fit_transform(train_text)
#Reuse the fitted vectorizer on the validation tweets
features_valid= vectorizer.transform(valid_text)
#check shape
features_train.shape, features_valid.shape

In [None]:
#Function to fit and apply a model
def model_apply(model):
    """Fit ``model`` on the training features and print validation metrics.

    Parameters:
        model: an estimator exposing ``fit(X, y)`` and ``predict(X)``.

    Uses the module-level ``features_train``/``y_train`` for fitting and
    ``features_valid``/``y_valid`` for evaluation. Prints the model, its
    accuracy, weighted F1 score and confusion matrix. Returns None.
    """
    #train the model
    model.fit(features_train,y_train)
    #make predictions
    pred=model.predict(features_valid)
    #model evaluation
    #scikit-learn metrics take (y_true, y_pred) in that order; passing them
    #swapped transposes the confusion matrix (rows must be the actual classes,
    #columns the predicted ones)
    print(model)
    print('Accuracy score: ',accuracy_score(y_valid,pred))
    print('Weighted F1 score: ',f1_score(y_true=y_valid,y_pred=pred,average='weighted'))
    print('Confusion Matrix: \n',confusion_matrix(y_valid,pred))

In [None]:
#Multinomial Naive Bayes: fit on the training features and print validation metrics
nb=MultinomialNB()
model_apply(nb)

In [None]:
#Logistic Regression (fixed seed, at most 500 solver iterations)
lr=LogisticRegression(random_state=10,max_iter=500)
model_apply(lr)

In [None]:
#Decision Tree (fixed seed for reproducible results)
dtc=DecisionTreeClassifier(random_state=10)
model_apply(dtc)

In [None]:
#Random Forest (fixed seed; n_jobs=-1 runs on all available CPU cores)
rf=RandomForestClassifier(random_state=101,n_jobs=-1)
model_apply(rf)