## Importing Required libraries

In [None]:
# Analysing data
import pandas as pd
import numpy as np

# Visualizing the data
from matplotlib import pyplot as plt
import seaborn as sns

# Processing the data
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import string

# Machine Learning part
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedStratifiedKFold, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, plot_confusion_matrix, roc_auc_score, roc_curve,auc
from sklearn.metrics import confusion_matrix, classification_report, log_loss, precision_score
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import FunctionTransformer


## Loading datasets

In [None]:
# Both CSV files are parsed with the same reader options.
csv_options = dict(encoding = "utf-8", engine = "python", header = 0)

train_data = pd.read_csv("datasets/tweet_analysis_train.csv", **csv_options)
test_data = pd.read_csv("datasets/tweet_analysis_test.csv", **csv_options)

### Getting info about the datasets

In [None]:
# Dimensions (rows, columns) of each split.
print("Train set shape : ", train_data.shape)
print("Test set shape : ", test_data.shape)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
train_data.info()

## Preparing the dataset

In [None]:
## Dropping the column 'id'
train_data = train_data.drop(columns = 'id')

## Checking for duplicates
duplicate_rows = train_data.duplicated()
print("Total duplicates in train set : ", duplicate_rows.sum())

#### Since the train dataset contains duplicates, we will remove them

In [None]:
# Keep the first occurrence of every duplicated row and discard the rest.
train_data = train_data.drop_duplicates(keep = 'first')
# Re-count the duplicates after the removal above.
print("Total duplicates in train set : ", train_data.duplicated().sum())

In [None]:
## Checking for missing values: one boolean per column, True if any entry is null
train_data.isna().any()

### Now check the counts of labels 0 and 1 and their percentages

In [None]:
# Number of non-null entries in every remaining column, grouped by label value.
train_data.groupby('label').count()

In [None]:
# Share of each label value, expressed as a percentage of all training rows.
label_column = train_data['label']
row_total = len(label_column)

Zeroes = 100 * int((label_column == 0).sum()) / row_total
Ones = 100 * int((label_column == 1).sum()) / row_total

print("Percentage of Zeroes(Positive) Sentiment tweets is : ", Zeroes)
print("Percentage of Ones(Negative) Sentiment tweets is : ", Ones)
print("\nAs we can see, The training dataset is very much Imbalanced")

### Now we will find the most frequent words in the train dataset

In [None]:
# Bag-of-words counts over the training tweets, English stop words excluded.
Count = CountVectorizer(stop_words='english')
words = Count.fit_transform(train_data.tweet)

# Column-wise totals: one summed count per vocabulary entry.
Sum = words.sum(axis = 0)

# Pair every vocabulary term with its total and order by descending count.
List = sorted(
    ((term, Sum[0, column]) for term, column in Count.vocabulary_.items()),
    key = lambda pair : pair[1],
    reverse = True,
)
Freq = pd.DataFrame(List, columns = ['word', 'frequency'])
print(Freq)

In [None]:
## Bar chart of the 10 most frequent words (the first 10 rows of the sorted Freq table)
Freq.head(10).plot(x = 'word', y = 'frequency', kind = 'bar', figsize = (10, 7), color = 'red')
plt.show()

### Now we will define functions to count words, characters, hashtags, stopwords, and numbers in each tweet

In [None]:
## Words count
def Count_words(dataframe):
    """Add a 'words' column with the number of space-separated pieces in each tweet.

    The tweet is converted with str() first. Splitting on a single space yields
    one more piece than there are spaces, so consecutive spaces produce empty
    pieces that are counted as well.
    """
    def _pieces(tweet):
        return str(tweet).count(" ") + 1

    dataframe['words'] = dataframe['tweet'].apply(_pieces)
    
## Characters Count
def Count_chars(dataframe):
    """Add a 'characters' column: the tweet's length with single spaces excluded.

    Each tweet is split on ' ' and the lengths of the resulting pieces are
    summed, so other whitespace (tabs, newlines) still counts as characters.
    """
    def _length_without_spaces(tweet):
        return sum(len(piece) for piece in tweet.split(' '))

    dataframe['characters'] = dataframe['tweet'].apply(_length_without_spaces)
    
## Hastags count
def Count_hashtags(dataframe):
    """Add a 'hashtags' column: how many whitespace-separated tokens start with '#'."""
    def _hashtag_total(tweet):
        return sum(1 for token in tweet.split() if token.startswith('#'))

    dataframe['hashtags'] = dataframe['tweet'].apply(_hashtag_total)
    
## stopwords count
def Count_stopwords(dataframe):
    """Add a 'stopwords' column: how many tokens of each tweet are English stop words.

    Tokens are the whitespace-separated pieces of the tweet and are compared
    as-is (case-sensitively) against NLTK's English stop-word list.
    """
    # A set gives constant-time membership tests; the plain list returned by
    # NLTK would be scanned from the start for every single token.
    stopword = set(stopwords.words('english'))
    dataframe['stopwords'] = dataframe['tweet'].apply(
        lambda tweet : sum(1 for token in tweet.split() if token in stopword)
    )
    
## Numbers count
def Count_numbers(dataframe):
    """Add a 'numbers' column: how many whitespace-separated tokens consist only of digits.

    This function used to be named Count_words as well, which silently replaced
    the word counter defined earlier; it now has its own name.
    """
    dataframe['numbers'] = dataframe['tweet'].apply(
        lambda tweet : sum(1 for token in str(tweet).split() if token.isdigit())
    )


def Calculate(dataframe):
    """Add all count features to `dataframe` in place.

    Creates the columns 'words', 'characters', 'hashtags', 'stopwords' and
    'numbers' from the 'tweet' column by calling each counter once.
    """
    Count_words(dataframe)
    Count_chars(dataframe)
    Count_hashtags(dataframe)
    Count_stopwords(dataframe)
    Count_numbers(dataframe)

# Add the count feature columns to both splits; Calculate modifies each frame in place.
for frame in (train_data, test_data):
    Calculate(frame)
train_data

### Now cleaning and processing the dataset

In [None]:
stopword = stopwords.words('english')
stemm = nltk.SnowballStemmer('english')

# Set copy of the stop-word list for constant-time membership tests.
_STOPWORD_SET = frozenset(stopword)

# (pattern, replacement) pairs applied in this order. Raw strings are used so
# that sequences such as \[ , \S and \w reach the regex engine unchanged and
# do not trigger Python's invalid-escape-sequence warning.
_CLEANING_STEPS = (
    (re.compile(r'\[.*?\]'), ''),                            # text inside square brackets
    (re.compile(r'https?://\S+|www\.\S+'), ''),              # URLs
    (re.compile(r'<.*?>+'), ''),                             # HTML-like tags
    (re.compile('[%s]' % re.escape(string.punctuation)), ''),  # punctuation characters
    (re.compile(r'\n'), ' '),                                # newline -> space so words are not fused
    (re.compile(r'\w*\d\w*'), ''),                           # any token containing a digit
)

def Delete(word):
    """Clean one tweet and return it as a space-joined string of stemmed tokens.

    Steps: convert to str and lowercase; strip bracketed text, URLs, tags,
    punctuation and digit-bearing tokens; drop English stop words; stem the
    remaining tokens with the Snowball stemmer.

    Differences from the previous version: newlines become a space instead of
    being deleted (deleting them glued the neighbouring words together), and
    tokens are split on any whitespace, so no empty tokens or repeated spaces
    remain in the result.

    Parameters
    ----------
    word : any
        The raw tweet; it is passed through str() before processing.

    Returns
    -------
    str
        The cleaned tweet.
    """
    text = str(word).lower()
    for pattern, replacement in _CLEANING_STEPS:
        text = pattern.sub(replacement, text)
    tokens = [stemm.stem(token) for token in text.split() if token not in _STOPWORD_SET]
    return " ".join(tokens)

# Replace the raw tweet text of both splits with its cleaned form.
for frame in (train_data, test_data):
    frame['tweet'] = frame['tweet'].apply(Delete)

In [None]:
# Hold out 20% of the labelled tweets for validation; the seed keeps the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    train_data['tweet'],
    train_data['label'],
    test_size = 0.20,
    shuffle = True,
    random_state = 11,
)
print(train_x)

### Now we need to vectorize the tweets, since models work on numeric features rather than raw text

In [None]:
## Using TfidfVectorizer
vector = TfidfVectorizer(stop_words='english')

# Fit the vocabulary and IDF weights on the training split only, then reuse the
# fitted vectorizer on the held-out split so both share the same feature columns.
X_train_vector = vector.fit_transform(train_x)
X_test_vector = vector.transform(test_x)

# Shapes of the two resulting feature matrices.
print(X_train_vector.shape, X_test_vector.shape)

In [None]:
## Analyzing the imbalance of the dataset
# value_counts() orders its result by frequency, not by label value. Sorting by
# index puts label 0 first and label 1 second, so the hard-coded wedge labels
# below always line up with the right class.
class_counts = train_y.value_counts().sort_index()
plt.pie(class_counts, labels=['Label 0 (Positive Tweets)', 'Label 1 (Negative Tweets)'], autopct='%0.1f%%')
plt.axis('equal')
plt.show()

### Since the data is highly imbalanced, oversampling is needed to balance the dataset

In [None]:
## Oversampling the minority class with the SMOTE technique
# A fixed seed makes the synthetic samples, and therefore every score computed
# from them, repeatable across runs.
smote = SMOTE(random_state=11)
X_train_sm, y_train_sm = smote.fit_resample(X_train_vector, train_y.values)
print(X_train_sm.shape, y_train_sm.shape)

In [None]:
## Thus now the dataset is balanced
# pd.value_counts() is deprecated; wrap the label array in a Series instead.
# Sorting by index keeps label 0 first and label 1 second so the wedge labels
# match their classes even when both counts are equal.
balanced_counts = pd.Series(y_train_sm).value_counts().sort_index()
plt.pie(balanced_counts, labels=['Label 0 (Positive Tweets)', 'Label 1 (Negative Tweets)'], autopct='%0.1f%%')
plt.axis('equal')
plt.show()

In [None]:
## Function to report the scores obtained on the training data
def Training(y_actual, y_predicted):
    """Print accuracy and F1-score (both rounded to 3 decimals) for training predictions.

    No model is fitted here; the function only compares `y_actual` with
    `y_predicted` and prints the result.
    """
    accuracy, f1 = (
        round(metric(y_actual, y_predicted), 3)
        for metric in (accuracy_score, f1_score)
    )
    print(f'Training Scores: Accuracy={accuracy}, F1-Score={f1}')
    
## Function to report the scores obtained on the validation data
def validation(y_actual, y_predicted):
    """Print accuracy and F1-score (both rounded to 3 decimals) for validation predictions."""
    accuracy, f1 = (
        round(metric(y_actual, y_predicted), 3)
        for metric in (accuracy_score, f1_score)
    )
    print(f'Validation Scores: Accuracy={accuracy}, F1-Score={f1}')

In [None]:
## Using LogisticRegression model
Model1 = LogisticRegression().fit(X_train_sm, y_train_sm)

# Scores on the oversampled training data
y_train_pred = Model1.predict(X_train_sm)
Training(y_train_sm, y_train_pred)

# Scores on the held-out split
y_test_pred = Model1.predict(X_test_vector)
validation(test_y, y_test_pred)


In [None]:
## Using MultinomialNB Model
Model2 = MultinomialNB().fit(X_train_sm, y_train_sm)

# Scores on the oversampled training data
y_train_pred = Model2.predict(X_train_sm)
Training(y_train_sm, y_train_pred)

# Scores on the held-out split
y_test_pred = Model2.predict(X_test_vector)
validation(test_y, y_test_pred)

In [None]:
## Using RandomForestClassifier model
# A random forest draws bootstrap samples and feature subsets at random, so the
# seed is fixed to make this model's scores repeatable from run to run.
Model3 = RandomForestClassifier(random_state=0)
Model3.fit(X_train_sm, y_train_sm)

# Predictions on the oversampled training data and on the held-out split
y_train_pred = Model3.predict(X_train_sm)
y_test_pred = Model3.predict(X_test_vector)

Training(y_train_sm, y_train_pred)
validation(test_y, y_test_pred)

### Of the models tried, the RandomForestClassifier gives the most accurate results, so we will continue with this model

In [None]:
# Final model: a random forest with explicit hyper-parameters and a fixed seed.
Model = RandomForestClassifier(
    criterion='entropy',
    max_samples=0.8,
    min_samples_split=10,
    random_state=0,
).fit(X_train_sm, y_train_sm)

# Scores on the oversampled training data
y_train_pred = Model.predict(X_train_sm)
Training(y_train_sm, y_train_pred)

# Scores on the held-out split
y_test_pred = Model.predict(X_test_vector)
validation(test_y, y_test_pred)

### Saving the Model using joblib

In [None]:
# joblib provides dump/load for persisting Python objects to disk.
from joblib import dump, load
# Write the fitted forest to 'Tweet_analysis.joblib' in the working directory.
# Only the classifier is saved here; the fitted `vector` is not part of this file.
dump(Model, 'Tweet_analysis.joblib')

### To use the saved model, we first need to load it

In [None]:
# NOTE: joblib files are pickle-based; only load files from a trusted source.
model = load('Tweet_analysis.joblib')

Tweet = "i Love India"
# Apply the same cleaning and the same fitted vectorizer that were used for training.
Tweet = Delete(Tweet)
Tweet = vector.transform([Tweet])

# Predict with the model that was just loaded from disk. The previous code
# called the in-memory `Model`, so the saved file was never actually used.
print(model.predict(Tweet))