Sentiment Analysis

In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [7]:
# Step 2: Load the Dataset
data = pd.read_csv("sentiment_analysis.csv")

In [8]:
# Step 3: Check the dataset structure and the first few rows
print("Dataset Columns:\n", data.columns)
print("\nFirst 5 Rows:\n", data.head())

Dataset Columns:
 Index(['Year', 'Month', 'Day', 'Time of Tweet', 'text', 'sentiment',
       'Platform'],
      dtype='object')

First 5 Rows:
    Year  Month  Day Time of Tweet  \
0  2018      8   18       morning   
1  2018      8   18          noon   
2  2017      8   18         night   
3  2022      6    8       morning   
4  2022      6    8          noon   

                                                text sentiment     Platform  
0              What a great day!!! Looks like dream.  positive    Twitter    
1     I feel sorry, I miss you here in the sea beach  positive    Facebook   
2                                     Don't angry me  negative     Facebook  
3  We attend in the class just for listening teac...  negative    Facebook   
4                  Those who want to go, let them go  negative   Instagram   


In [12]:
# Step 4: Preprocess the Data
# We will use the 'text' column for input features and 'sentiment' column as the target
X = data['text']  # Text data
y = data['sentiment']  # Sentiment labels

# Optional: Handle missing values or other preprocessing steps as needed
X = X.dropna()  # Remove missing text entries
y = y[X.index]  # Align 'y' with 'X' after dropping missing rows


In [15]:
# Step 5: Convert text to numerical features using TF-IDF Vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer  # Import the TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)  # Limit to 5000 features
X_vec = vectorizer.fit_transform(X)


In [17]:
# Step 6: Split the Data into Training and Testing Sets
from sklearn.model_selection import train_test_split  # Import the function

X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.3, random_state=42)


In [19]:
# Step 7: Train the Model (Logistic Regression)
from sklearn.linear_model import LogisticRegression  # Import the model

classifier = LogisticRegression(max_iter=1000)  # Increase max_iter if convergence issues
classifier.fit(X_train, y_train)


In [20]:
# Step 8: Make Predictions
y_pred = classifier.predict(X_test)

In [22]:
from sklearn.metrics import accuracy_score, classification_report  # Import evaluation metrics

# Step 9: Evaluate the Model
print("\nAccuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Accuracy Score: 0.66

Classification Report:
               precision    recall  f1-score   support

    negative       1.00      0.27      0.43        48
     neutral       0.49      1.00      0.66        48
    positive       0.97      0.70      0.82        54

    accuracy                           0.66       150
   macro avg       0.82      0.66      0.63       150
weighted avg       0.83      0.66      0.64       150

