<a href="https://colab.research.google.com/github/MohibShaikh/Twitter-Sentiment-Analysis/blob/main/Sentiment_Analysis_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

In [None]:
# Fetch the NLTK stopword corpus that nltk.corpus.stopwords reads from.
import nltk
nltk.download('stopwords')

In [None]:
# Preview the English stopword list — the same list the stemming step filters with.
print(stopwords.words('english'))

### Data Preprocessing

In [None]:
# Column names are passed via `names=`, so pandas reads every row of the file as data.
columns = ['target', 'id', 'date', 'flag', 'user', 'text']

# Load the tweets; the file is decoded as ISO-8859-1.
twt_data = pd.read_csv(
    '/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv',
    names=columns,
    encoding='ISO-8859-1',
)

In [None]:
# First rows of the loaded frame.
twt_data.head()

In [None]:
# (rows, columns) of the loaded frame.
twt_data.shape

In [None]:
# Summary statistics; with default arguments describe() reports numeric columns only.
twt_data.describe()

In [None]:
# Last rows of the loaded frame.
twt_data.tail()

In [None]:
# Number of missing values in each column.
twt_data.isnull().sum()

In [None]:
# Remap label 4 to 1 so the target is binary {0, 1}
# (in Sentiment140, 4 marks positive tweets — confirm against the dataset card).
# Column-level assignment replaces the frame-wide inplace mutation.
twt_data['target'] = twt_data['target'].replace({4: 1})

# Kept as the cell's last expression so the class balance is actually displayed.
twt_data['target'].value_counts()

In [None]:
# Single shared Porter stemmer instance, used by stemming() below.
port_stem = PorterStemmer()

In [None]:
# Built once rather than per token: the original called stopwords.words('english')
# for every word and tested membership against the returned list.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_NON_LETTERS = re.compile('[^a-zA-Z]')

def stemming(content):
  """Normalise one tweet into a space-joined string of Porter stems.

  Every character that is not an ASCII letter is replaced by a space, the
  text is lower-cased and split on whitespace, English stopwords are
  dropped, and each remaining token is stemmed with the module-level
  ``port_stem``.

  Args:
    content: the raw tweet text (a ``str``).

  Returns:
    The stemmed tokens joined by single spaces; an empty string when no
    token survives the filtering.
  """
  tokens = _NON_LETTERS.sub(' ', content).lower().split()
  return ' '.join(port_stem.stem(word) for word in tokens if word not in _ENGLISH_STOPWORDS)

In [None]:
# Run stemming() over every tweet and store the result in a new column.
twt_data['stemmed_content'] = twt_data['text'].apply(stemming)

In [None]:
# Display the frame, now including the stemmed_content column.
twt_data

In [None]:
# Features: the stemmed tweet strings. Labels: the remapped target column.
X = twt_data['stemmed_content'].values
Y = twt_data['target'].values

In [None]:
# 80/20 split, stratified on Y so both parts keep the same class ratio; seed fixed for repeatability.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y,random_state=2)

In [None]:
# Fit the TF-IDF vectorizer on the training split only, then apply it to the test split.
# NOTE: X_train / X_test are overwritten with the vectorised matrices, so re-running
# this cell without re-running the split cell feeds matrices back into the vectorizer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [None]:
# Fit logistic regression on the TF-IDF training features.
model = LogisticRegression(max_iter=1000, random_state=2).fit(X_train,Y_train)

In [None]:
# Fit a gradient-boosting classifier (default hyperparameters, fixed seed) on the same training features.
model_2 = GradientBoostingClassifier(random_state=2).fit(X_train,Y_train)

## Logistic Regression

In [None]:
# Score the logistic-regression model on the data it was trained on.
# NOTE: these are training-set metrics; this cell does not score X_test / Y_test.
X_train_pred = model.predict(X_train)
training_data_acc = accuracy_score(Y_train, X_train_pred)
training_data_cm = confusion_matrix(Y_train, X_train_pred)
training_data_ps = precision_score(Y_train, X_train_pred)
training_data_rs = recall_score(Y_train, X_train_pred)

In [None]:
# Report the training-set metrics for logistic regression.
# The confusion matrix is stored as training_data_cm (the previous name
# training_data_cf was never defined and raised NameError).
print(f'Accuracy Score (LogisticRegression): {training_data_acc}')
print(f'Confusion Matrix (LogisticRegression): {training_data_cm}')
print(f'Precision Score (LogisticRegression): {training_data_ps}')
print(f'Recall Score (LogisticRegression): {training_data_rs}')

## Gradient Boosting Classifier

In [None]:
# Score the gradient-boosting model on the data it was trained on.
# NOTE: these are training-set metrics; this cell does not score X_test / Y_test.
X_train_pred2 = model_2.predict(X_train)
training_data_acc2 = accuracy_score(Y_train, X_train_pred2)
training_data_cm2 = confusion_matrix(Y_train, X_train_pred2)
training_data_ps2 = precision_score(Y_train, X_train_pred2)
training_data_rs2 = recall_score(Y_train, X_train_pred2)

In [None]:
# Report the training-set metrics for the gradient-boosting classifier.
# The confusion matrix is stored as training_data_cm2 (the previous name
# training_data_cf2 was never defined and raised NameError).
print(f'Accuracy Score (GradientBoostingClassifier): {training_data_acc2}')
print(f'Confusion Matrix (GradientBoostingClassifier): {training_data_cm2}')
print(f'Precision Score (GradientBoostingClassifier): {training_data_ps2}')
print(f'Recall Score (GradientBoostingClassifier): {training_data_rs2}')