# Project Name: Entity-Level Sentiment Analysis

# Requirements

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

# Read dataset

In [2]:
# Read CSV file into DataFrame
path = r"G:\DATA SCIENCE-25\Github\entity_senti_git\dataset\preprocess.csv"
df = pd.read_csv(path)

# Display DataFrame
df.head()

Unnamed: 0,Tweet_ID,Borderlands,Sentiment,Tweet_Content,Words_Per_Tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,10
1,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,10
2,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,10
3,2402,Borderlands,Positive,So I spent a few hours making something for fu...,55
4,2402,Borderlands,Positive,So I spent a couple of hours doing something f...,54


# Simple clean the data for entity recognation

In [3]:
import re

def clean_text(text):
    # Define the pattern to remove unwanted substrings and symbols
    pattern = r"[^a-zA-Z]+|\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    
    # Remove unwanted patterns from the text using the defined pattern
    cleaned_text = re.sub(pattern, ' ', text)
    
    # Convert text to lowercase
    cleaned_text = cleaned_text.lower()
    
    
    return cleaned_text 

In [4]:
df['Preprocess_text'] = df['Tweet_Content'].apply(clean_text)

In [5]:
df.head()

Unnamed: 0,Tweet_ID,Borderlands,Sentiment,Tweet_Content,Words_Per_Tweet,Preprocess_text
0,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,10,im getting on borderlands and i will kill you ...
1,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,10,im coming on borderlands and i will murder you...
2,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,10,im getting into borderlands and i can murder y...
3,2402,Borderlands,Positive,So I spent a few hours making something for fu...,55,so i spent a few hours making something for fu...
4,2402,Borderlands,Positive,So I spent a couple of hours doing something f...,54,so i spent a couple of hours doing something f...


Import required modules

!python -m spacy download en_core_web_sm

In [7]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 487.6 kB/s eta 0:00:27
     --------------------------------------- 0.1/12.8 MB 744.7 kB/s eta 0:00:18
     --------------------------------------- 0.1/12.8 MB 847.9 kB/s eta 0:00:15
      -------------------------------------- 0.2/12.8 MB 803.1 kB/s eta 0:00:16
      -------------------------------------- 0.2/12.8 MB 915.1 kB/s eta 0:00:14
      -------------------------------------- 0.3/12.8 MB 944.1 kB/s eta 0:00:14
      -------------------------------------- 0.3/12.8 MB 944.1 kB/s eta 0:00:14
      -------------------------------------- 0.3/12.8 MB 944.1 kB/s eta 0:00:14
      ----------------------------------

In [8]:
import spacy
from spacy.tokens import Span

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

# Define custom entities
custom_entities = ['Borderlands','Overwatch','Xbox(Xseries)','TomClancysGhostRecon','Dota2', 'CS-GO',
                   'AssassinsCreed','ApexLegends','LeagueOfLegends','Fortnite','Hearthstone','Battlefield',
                   'PlayerUnknownsBattlegrounds','PUBG','CallOfDuty','TomClancysRainbowSix','GrandTheftAuto(GTA)',
                   'Cyberpunk2077']

# Add custom entity ruler to the pipeline
ruler = nlp.add_pipe("entity_ruler")

# Define patterns for custom entities
patterns = [
    {"label": "PERSON", "pattern": [{"LOWER": "borderlands"}]},
    {"label": "PERSON", "pattern": [{"LOWER": "playerunknownsbattlegrounds"}]},
    {"label": "PERSON", "pattern": [{"LOWER": "tomclancysrainbowsix"}]},
    {"label": "PERSON", "pattern": [{"LOWER": "hearthstone"}]},
    {"label": "GAME", "pattern": [{"LOWER": "pubg"}]},
    {"label": "GAME", "pattern": [{"LOWER": "xbox(xseries)"}]},
    {"label": "GAME", "pattern": [{"LOWER": "overwatch"}]},
    {"label": "GAME", "pattern": [{"LOWER": "leagueoflegends"}]},
    {"label": "GAME", "pattern": [{"LOWER": "apexlegends"}]},
    {"label": "GAME", "pattern": [{"LOWER": "callofduty"}]},
    {"label": "GAME", "pattern": [{"LOWER": "battlefield"}]},
    {"label": "ORG", "pattern": [{"LOWER": "dota2"}]},
    {"label": "ORG", "pattern": [{"LOWER": "cs-go"}]},
    {"label": "ORG", "pattern": [{"LOWER": "assassinscreed"}]},
    {"label": "ORG", "pattern": [{"LOWER": "fortnite"}]},
    {"label": "ORG", "pattern": [{"LOWER": "grandtheftauto(gta)"}]},
    {"label": "ORG", "pattern": [{"LOWER": "cyberpunk2077"}]}
]

# Add patterns to the ruler
ruler.add_patterns(patterns)

# Function to perform entity recognition on a text
def entity_recognition(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

# Evaluation of Entity

In [9]:
# Example usage
text = "Mr. @Harun is playing borderlands and Overwatch on Xbox(Xseries) and TomClancysGhostRecon."
preprocess = clean_text(text)
print(preprocess)
entities = entity_recognition(preprocess)
print(entities)

mr harun is playing borderlands and overwatch on xbox xseries and tomclancysghostrecon 
[('harun', 'PERSON'), ('borderlands', 'PERSON'), ('overwatch', 'GAME')]


In [None]:
#df['Entities'] = df['Preprocess_text'].apply(entity_recognition)

# Save the Entity dataset

In [10]:
df.to_csv(r"G:\DATA SCIENCE-25\Github\entity_senti_git\dataset\entity.csv", index=False)

# Preporcess for Sentiment Analysis

In [13]:
# Import required modules
#python -m spacy download en_core_web_sm

import spacy
import string
from spacy.lang.en.stop_words import STOP_WORDS

# Load the large spaCy model
nlp = spacy.load("en_core_web_sm")

# Access the default stop words from the loaded model
stop_words = nlp.Defaults.stop_words

# Define punctuation
punctuations = string.punctuation

In [14]:
# Creating our tokenizer function
def spacy_token(sentence):
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [word.lemma_.lower().strip() for word in doc]

    # Removing stop words and punctuation
    mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]
    
    # Joining tokens back into a sentence
    processed_sentence = " ".join(mytokens)

    # return preprocessed sentence
    return processed_sentence

In [15]:
# Apply the tokenizer function to the 'text' column and store the result in the 'tokenize' column
df['tokenize'] = df['Preprocess_text'].apply(spacy_token)
df.head()

Unnamed: 0,Tweet_ID,Borderlands,Sentiment,Tweet_Content,Words_Per_Tweet,Preprocess_text,tokenize
0,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,10,im getting on borderlands and i will kill you ...,m borderland kill
1,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,10,im coming on borderlands and i will murder you...,m come borderland murder
2,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,10,im getting into borderlands and i can murder y...,m borderland murder
3,2402,Borderlands,Positive,So I spent a few hours making something for fu...,55,so i spent a few hours making something for fu...,spend hour fun don t know huge borderland fan ...
4,2402,Borderlands,Positive,So I spent a couple of hours doing something f...,54,so i spent a couple of hours doing something f...,spend couple hour fun don t know m huge border...


# Evaluation of spacy_token

In [16]:
# Example usage
text = "Mr. @Harun is playing borderlands and Overwatch on Xbox(Xseries) and TomClancysGhostRecon."
preprocess = clean_text(text)
spacy_text = spacy_token(preprocess)
print(spacy_text)

mr harun play borderland overwatch xbox xserie tomclancysghostrecon


# Save the spacy token text

In [17]:
df = df[df['tokenize'].apply(len) >3]

In [18]:
df = df.reset_index(drop=True)

# Save the spacy token text

In [19]:
df.to_csv(r"G:\DATA SCIENCE-25\Github\entity_sentiment_streamlit\dataset\spacy_token.csv", index=False)

# Text embedding

In [20]:
from sentence_transformers import SentenceTransformer,util
model = SentenceTransformer('all-MiniLM-L6-v2')

In [21]:
# Compute embeddings for the 'Text' column
embeddings = model.encode(df['tokenize'])

In [22]:
df['embeddings'] = embeddings.tolist() 

# Model Evaluation

In [23]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['Sentiment'] = le.fit_transform(df['Sentiment'])

In [24]:
# Split the dataset into features (X) and target (y)
X = df['embeddings'].tolist()
y = df['Sentiment'].tolist()

from sklearn.model_selection import train_test_split

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Logistic Regression

In [25]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

lr.fit(X_train, y_train)

In [26]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the model on training and test data
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
cm = confusion_matrix(y_test, y_pred_test)

print("Accuracy on train set:", accuracy_train)
print("Accuracy on test set:", accuracy_test)
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", classification_report(y_test, y_pred_test))

Accuracy on train set: 0.8235202248945807
Accuracy on test set: 0.8193403298350824
Confusion Matrix:
 [[3580  668]
 [ 778 2978]]
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.84      0.83      4248
           1       0.82      0.79      0.80      3756

    accuracy                           0.82      8004
   macro avg       0.82      0.82      0.82      8004
weighted avg       0.82      0.82      0.82      8004



# RandomForestClassifier

In [27]:
from sklearn.ensemble import RandomForestClassifier

# Train a RandomForestClassifier model
rf = RandomForestClassifier()

rf.fit(X_train, y_train)

In [28]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the model on training and test data
y_pred_train = rf.predict(X_train)
y_pred_test = rf.predict(X_test)

accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
cm = confusion_matrix(y_test, y_pred_test)

print("Accuracy on train set:", accuracy_train)
print("Accuracy on test set:", accuracy_test)
print("Confusion Matrix:\n", cm)
print("Classification Report:\n", classification_report(y_test, y_pred_test))

Accuracy on train set: 0.9963454630641887
Accuracy on test set: 0.9026736631684158
Confusion Matrix:
 [[3948  300]
 [ 479 3277]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.93      0.91      4248
           1       0.92      0.87      0.89      3756

    accuracy                           0.90      8004
   macro avg       0.90      0.90      0.90      8004
weighted avg       0.90      0.90      0.90      8004



# Evaluation

In [29]:
import re

def clean_text(text):
    # Define the pattern to remove unwanted substrings and symbols
    pattern = r"[^a-zA-Z]+|\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
    
    # Remove unwanted patterns from the text using the defined pattern
    cleaned_text = re.sub(pattern, ' ', text)
    
    # Convert text to lowercase
    cleaned_text = cleaned_text.lower()
    
    
    return cleaned_text
import spacy
from spacy.tokens import Span

# Load SpaCy English model
nlp = spacy.load("en_core_web_sm")

# Define custom entities
custom_entities = ['Borderlands','Overwatch','Xbox(Xseries)','TomClancysGhostRecon','Dota2', 'CS-GO',
                   'AssassinsCreed','ApexLegends','LeagueOfLegends','Fortnite','Hearthstone','Battlefield',
                   'PlayerUnknownsBattlegrounds','PUBG','CallOfDuty','TomClancysRainbowSix','GrandTheftAuto(GTA)',
                   'Cyberpunk2077']

# Add custom entity ruler to the pipeline
ruler = nlp.add_pipe("entity_ruler")

# Define patterns for custom entities
patterns = [
    {"label": "PERSON", "pattern": [{"LOWER": "borderlands"}]},
    {"label": "PERSON", "pattern": [{"LOWER": "playerunknownsbattlegrounds"}]},
    {"label": "PERSON", "pattern": [{"LOWER": "tomclancysrainbowsix"}]},
    {"label": "PERSON", "pattern": [{"LOWER": "hearthstone"}]},
    {"label": "GAME", "pattern": [{"LOWER": "pubg"}]},
    {"label": "GAME", "pattern": [{"LOWER": "xbox(xseries)"}]},
    {"label": "GAME", "pattern": [{"LOWER": "overwatch"}]},
    {"label": "GAME", "pattern": [{"LOWER": "leagueoflegends"}]},
    {"label": "GAME", "pattern": [{"LOWER": "apexlegends"}]},
    {"label": "GAME", "pattern": [{"LOWER": "callofduty"}]},
    {"label": "GAME", "pattern": [{"LOWER": "battlefield"}]},
    {"label": "ORG", "pattern": [{"LOWER": "dota2"}]},
    {"label": "ORG", "pattern": [{"LOWER": "cs-go"}]},
    {"label": "ORG", "pattern": [{"LOWER": "assassinscreed"}]},
    {"label": "ORG", "pattern": [{"LOWER": "fortnite"}]},
    {"label": "ORG", "pattern": [{"LOWER": "grandtheftauto(gta)"}]},
    {"label": "ORG", "pattern": [{"LOWER": "cyberpunk2077"}]}
]

# Add patterns to the ruler
ruler.add_patterns(patterns)

# Function to perform entity recognition on a text
def entity_recognition(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities 

text = "Mr. @Harun is playing borderlands and Overwatch on Xbox(Xseries) and TomClancysGhostRecon."
preprocess = clean_text(text)

entities = entity_recognition(preprocess)
print(entities)

spacy_text = spacy_token(preprocess)


embeddings = model.encode(spacy_text)


# Predict sentiment using the trained model
predicted_sentiment = rf.predict(embeddings.reshape(1, -1))

# Map predicted sentiment label to human-readable format
sentiment_label = "Positive" if predicted_sentiment == 0 else "Negative"

print(f"Predicted Sentiment: {sentiment_label}")

[('harun', 'PERSON'), ('borderlands', 'PERSON'), ('overwatch', 'GAME')]
Predicted Sentiment: Negative


In [None]:
# Deployment

In [30]:
import pickle

# Save the trained Random Forest model to a file
with open(r'G:\DATA SCIENCE-25\Github\entity_senti_git\model\rf.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)