# **Sentiment Analysis Using Machine Learning**

> ### **Dataset Description**

The sentiment analysis dataset is generated automatically, using emoticons as indicators of sentiment polarity: tweets with positive emoticons such as `:)` are interpreted as positive, while tweets with negative emoticons such as `:(` are interpreted as negative. The dataset includes six fields structured as CSV files:

Polarity represents the tweet's sentiment polarity (0 for negative, 2 for neutral, and 4 for positive),
Tweet ID represents a unique identifier for each tweet,
Tweet Date represents the date and time of the tweet in UTC format,
Query specifies the query term used in the tweet, or 'NO_QUERY' if there is no query,
User represents the username of the Twitter user who posted the tweet,
Text indicates the tweet's actual content, without emoticons.
This dataset is designed to be used by the global data science community for research on sentiment analysis experiments. This allows researchers to explore techniques and algorithms for analysing emotions.

### Import Libraries & DataSet

In [None]:
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem.wordnet import WordNetLemmatizer
import string
from string import punctuation 
from nltk.tokenize import word_tokenize
from nltk.stem import LancasterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Lift the row/column display limits and allow up to 255 characters per column
# when pandas renders a frame.
for _option, _value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.max_colwidth', 255),
):
    pd.set_option(_option, _value)

### Loading the Dataset

In [None]:
# Load the train and test CSV files from the Kaggle input directory, decoding
# both as latin1.
df = pd.read_csv("/kaggle/input/sentiment-analysis-dataset/train.csv", encoding='latin1')
test = pd.read_csv("/kaggle/input/sentiment-analysis-dataset/test.csv", encoding='latin1')

In [None]:
df.head()

### Concatenate Training and Testing Dataset

In [None]:
# Stack the train and test frames into a single dataset. ignore_index=True gives
# the combined frame a fresh 0..n-1 index; without it the row labels of both
# files are kept and label-based lookups would hit duplicates.
df = pd.concat([df, test], ignore_index=True)

In [None]:
df.head()

In [None]:
df.shape

### Checking the INFO of the Dataset

In [None]:
df.info()

### Dropping the Unnecessary Columns

In [None]:
# Columns removed from the frame before any text processing.
columns_to_drop = [
    'textID',
    'Time of Tweet',
    'Age of User',
    'Country',
    'Population -2020',
    'Land Area (Km²)',
    'Density (P/Km²)',
    "selected_text",
]
df = df.drop(columns=columns_to_drop)

All the unnecessary columns have now been dropped.

In [None]:
df.head()

In [None]:
### Checking the unique values of sentiment column
df['sentiment'].value_counts()

In [None]:
df.isnull().sum()

In [None]:
# Remove, in place, every row that contains a missing value.
df.dropna(inplace=True)

In [None]:
df.duplicated().sum()

## **Basic Preprocessing**

- Remove tags - HTML
- Lower case
- remove stopwords

In [None]:
df.head(3)

### **remove_tags**

In [None]:
def remove_tags(raw_text):
    """Return ``raw_text`` (coerced to ``str``) with every ``<...>`` span removed.

    The match is non-greedy, so each tag is removed separately and the text
    between tags is kept.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', str(raw_text))

In [None]:
# First cleaning step: strip tags from the raw text and store the result in a
# new 'cleaned_text' column.
df['cleaned_text'] = df['text'].apply(remove_tags)

In [None]:
df.head(3)

### Making all the Text in Lower case

In [None]:
# Lower-case the text. Reading from 'cleaned_text' rather than the raw 'text'
# column keeps the tag stripping done in the previous step instead of
# overwriting it.
df['cleaned_text'] = df['cleaned_text'].apply(lambda x: str(x).lower())

In [None]:
df.head(3)

In [None]:
def remove_unnecessary_characters(text):
    """Return ``text`` reduced to ASCII letters, digits and single spaces.

    The input is coerced to ``str``, every character that is not ``a-z``,
    ``A-Z``, ``0-9`` or whitespace is deleted, runs of whitespace collapse to
    one space, and leading/trailing whitespace is trimmed.
    """
    as_string = str(text)
    letters_digits_spaces = re.sub(r'[^a-zA-Z0-9\s]', '', as_string)
    single_spaced = re.sub(r'\s+', ' ', letters_digits_spaces)
    return single_spaced.strip()

In [None]:
# Apply the character filter to 'cleaned_text' so it builds on the cleaning
# already stored there instead of starting again from the raw 'text' column.
df['cleaned_text'] = df['cleaned_text'].apply(remove_unnecessary_characters)

In [None]:
df.head(3)

### Normalizing the Dataset

In [None]:
def normalize_text(text):
    """Strip punctuation and tidy whitespace in ``text``.

    Non-string input is returned as ``str(text)`` without further processing.
    For strings, every character that is neither a word character nor
    whitespace is removed, whitespace runs collapse to one space, and the
    result is trimmed.
    """
    if not isinstance(text, str):
        return str(text)
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return re.sub(r'\s+', ' ', without_punctuation).strip()
# Normalize 'cleaned_text' (not the raw 'text' column) so the earlier cleaning
# steps are preserved rather than overwritten.
df['cleaned_text'] = df['cleaned_text'].apply(normalize_text)

In [None]:
df.head(3)

### Removing Stopwords From Dataset

In [None]:
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
sw_list = stopwords.words('english')
# Membership is tested once per token, so use a set for constant-time lookups.
_sw_set = set(sw_list)

# Drop stopword tokens from 'cleaned_text' (not the raw 'text' column) so the
# earlier cleaning steps are kept, then re-join the surviving tokens with
# single spaces.
df['cleaned_text'] = df['cleaned_text'].apply(
    lambda x: " ".join(item for item in x.split() if item not in _sw_set)
)

In [None]:
df.tail(3)

### Removing URLS

In [None]:
def remove_urls(text):
    """Return ``text`` with URLs removed.

    Matches ``http://`` and ``https://`` links as well as bare ``www.`` links;
    each match runs up to the next whitespace character. Surrounding
    whitespace is left as is.
    """
    # ``https?`` makes the trailing "s" optional, so both schemes are covered
    # by one pattern. (``http?://`` would make the "p" optional instead and
    # never match an https link.)
    pattern = re.compile(r'https?://\S+|www\.\S+')
    return pattern.sub('', text)

In [None]:
# Strip URLs from 'cleaned_text' so the result builds on the cleaning already
# stored there instead of restarting from the raw 'text' column.
df['cleaned_text'] = df['cleaned_text'].apply(remove_urls)

In [None]:
df.head(3)

In [None]:
# Characters to delete: the ASCII punctuation set from the string module.
exclude = string.punctuation
exclude


def remove_punc1(text):
    """Return ``text`` with every character found in ``exclude`` deleted."""
    deletion_table = str.maketrans('', '', exclude)
    return text.translate(deletion_table)

In [None]:
# Remove punctuation from 'cleaned_text' rather than from the raw 'text' column;
# reading from 'text' here would discard every cleaning step applied so far.
df['cleaned_text'] = df['cleaned_text'].apply(remove_punc1)

In [None]:
df.head()

### Splitting the Dataset

In [None]:
# Features are the cleaned text; labels are the sentiment column.
X = df['cleaned_text']
y = df['sentiment']

### Encoding the Categorical Values

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the sentiment labels in y with integer class codes.
encoder = LabelEncoder()

y = encoder.fit_transform(y)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; random_state=1 fixes the split.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

### Applying BoW

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer created with its default settings.
cv = CountVectorizer()

In [None]:

# Fit the vectorizer on the training split only, then reuse it unchanged for
# the test split. .toarray() converts each result to a dense array.
X_train_bow = cv.fit_transform(X_train).toarray()
X_test_bow = cv.transform(X_test).toarray()

### Using GaussianNB

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes with default settings, fitted on the dense
# bag-of-words training matrix.
gnb = GaussianNB()

gnb.fit(X_train_bow,y_train)     

In [None]:
y_pred = gnb.predict(X_test_bow)

In [None]:
from sklearn.metrics import accuracy_score,classification_report, ConfusionMatrixDisplay
# y_pred holds the GaussianNB predictions at this point, so the accuracy is
# stored under a GaussianNB-specific name and is not clobbered when the
# logistic-regression score is computed later.
score_gnb = accuracy_score(y_test, y_pred)
score_gnb

### Using Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression on the bag-of-words features; n_jobs=-1 requests all
# available CPU cores.
# NOTE(review): the iteration limit is left at the library default, and
# warnings are silenced at the top of the notebook, so a non-convergence
# warning would not be shown. Confirm the fit converges.
lr = LogisticRegression(n_jobs=-1)
lr.fit(X_train_bow,y_train)

In [None]:
y_pred = lr.predict(X_test_bow)

In [None]:
from sklearn.metrics import accuracy_score,classification_report, ConfusionMatrixDisplay
# Accuracy of the predictions currently held in y_pred against the held-out
# labels.
score_lr = accuracy_score(y_test, y_pred)
score_lr

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed (random_state=0), fitted on the same
# bag-of-words training matrix as the other models.
rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train_bow, y_train)

In [None]:
# Predict with the random forest and score it on the held-out split.
pred_rfc = rfc.predict(X_test_bow)
score_rfc = rfc.score(X_test_bow, y_test)
score_rfc

In [None]:
# NOTE(review): when the cells are run in order, y_pred was last assigned from
# lr.predict(...), so this report describes the logistic-regression model; the
# random-forest predictions live in pred_rfc.
print(classification_report(y_test, y_pred))

In [None]:
# Plot the confusion matrix for the predictions in y_pred.
# NOTE(review): when the cells are run in order these are the
# logistic-regression predictions, not the random-forest ones (pred_rfc).
ConfusionMatrixDisplay.from_predictions(y_test, y_pred);

In [None]:
def wp(text):
    """Return ``text`` converted to upper case."""
    uppercased = text.upper()
    return uppercased

def output_lable(n):
    """Map an encoded class value to its message.

    ``0``, ``1`` and ``2`` map to the negative, neutral and positive messages
    respectively; any other value yields ``None``.
    """
    messages_by_code = (
        (0, "The Text Sentement is Negative"),
        (1, "The Text Sentement is Neutral"),
        (2, "The Text Sentement is Positive"),
    )
    # Compare with == in order (rather than a dict lookup) so unhashable
    # inputs that support equality comparison are still accepted.
    for code, message in messages_by_code:
        if n == code:
            return message
    return None
    
def manual_testing(news):
    """Print the logistic-regression sentiment message for one piece of text.

    ``news`` is wrapped in a one-row DataFrame, passed through ``wp``,
    vectorized with the fitted ``cv`` and classified with ``lr``. The message
    produced by ``output_lable`` is printed; the function returns ``None``.
    """
    new_def_test = pd.DataFrame({"text": [news]})
    new_def_test["text"] = new_def_test["text"].apply(wp)
    new_xv_test = cv.transform(new_def_test["text"])
    # predict() returns one label per input row; take the single element so
    # output_lable compares a scalar instead of a length-1 array.
    pred_lr = lr.predict(new_xv_test)[0]
    # Only the logistic-regression label is reported, so no other model is
    # queried here.
    print(output_lable(pred_lr))

In [None]:
# Try the manual test helper on a sample sentence.
text = "I am very Happy "
manual_testing(text)