<a href="https://colab.research.google.com/github/Hyndavnath2583/Sentiment-Analysis-Logistic-Regression/blob/main/SentimentAnalysis_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer

# Step 1: Upload your dataset (Assume the dataset is in CSV format)
from google.colab import files
uploaded = files.upload()  # Opens the Colab upload widget; iterating the result yields file names

# A cancelled/empty upload would otherwise surface as a bare StopIteration below.
if not uploaded:
    raise ValueError("No file was uploaded; please upload a CSV dataset.")

# Load the first uploaded file into a pandas DataFrame
df = pd.read_csv(next(iter(uploaded)))

# Step 2: Check the columns in the dataset
print("Columns in the dataset:", df.columns)


# Step 3: Locate the sentiment and text columns regardless of header casing
def _resolve_column(columns, preferred, fallback):
    """Pick the column name to use for a logical field.

    Tries `preferred` and then `fallback` as exact names; failing that, returns
    the first column whose stripped, lower-cased name equals `fallback`.
    Returns `fallback` itself when nothing matches, so the caller's
    "column not in df.columns" check still triggers.
    """
    for exact in (preferred, fallback):
        if exact in columns:
            return exact
    for col in columns:
        if str(col).strip().lower() == fallback:
            return col
    return fallback


sentiment_column = _resolve_column(df.columns, 'Sentiment', 'sentiment')
text_column = _resolve_column(df.columns, 'Text', 'text')

# Ensure the sentiment column exists. Raise instead of calling exit(): inside a
# notebook, exit() asks the kernel to shut down (discarding all state) rather
# than stopping the cell with a readable error.
if sentiment_column not in df.columns:
    raise ValueError(
        f"The column '{sentiment_column}' is not in the dataset. "
        f"Available columns: {list(df.columns)}"
    )

# Step 4: Clean and process the sentiment column (trim whitespace, lower-case)
df[sentiment_column] = df[sentiment_column].str.strip().str.lower()

# Check the unique values in the sentiment column
print(f"Unique values in '{sentiment_column}' column:", df[sentiment_column].unique())

# Step 5: Map sentiment labels to numerical values.
# Series.map yields NaN for any label that is not a key of the mapping.
label_map = {'positive': 1, 'negative': 0, 'neutral': 2}
mapped_labels = df[sentiment_column].map(label_map)
unmapped = mapped_labels.isnull()

# Check if any missing values exist after mapping
print("Missing values after mapping:", unmapped.sum())

# Rows whose label could not be mapped cannot be used as training targets
# (a NaN target makes model fitting fail), so report and drop them here.
if unmapped.all():
    raise ValueError(
        f"No rows with a recognised sentiment label; expected one of {list(label_map)}."
    )
if unmapped.any():
    print("Dropping rows with unrecognised sentiment labels:",
          df.loc[unmapped, sentiment_column].unique())
df = df.loc[~unmapped].copy()
df[sentiment_column] = mapped_labels[~unmapped].astype(int)

# Step 6: Preprocessing function for text data
def preprocess_text(text) -> str:
    """Normalise one raw text entry for bag-of-words vectorisation.

    Removes URLs (``http...``), @mentions and every character that is not an
    ASCII letter or whitespace, then lower-cases the result. Whitespace left
    behind by removed tokens is not collapsed.

    Args:
        text: The raw text. Missing values (``None``, NaN, ``pd.NA``) are
            treated as empty text; any other non-string value is converted
            with ``str()`` first, so a column with blank or numeric cells
            does not make ``re.sub`` raise ``TypeError``.

    Returns:
        The cleaned, lower-cased string (possibly empty).
    """
    if not isinstance(text, str):
        if text is None or pd.isna(text):
            return ''
        text = str(text)
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\S+', '', text)    # Remove mentions
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    text = text.lower()  # Convert to lowercase
    return text

# Ensure the text column exists. Raise instead of calling exit(): inside a
# notebook, exit() asks the kernel to shut down (discarding all state) rather
# than stopping the cell with a readable error.
if text_column not in df.columns:
    raise ValueError(
        f"The column '{text_column}' is not in the dataset. "
        f"Available columns: {list(df.columns)}"
    )

# Apply preprocessing to the text column
df['Cleaned_Text'] = df[text_column].apply(preprocess_text)

# Step 7: Define the target and split the cleaned text BEFORE vectorizing,
# so the held-out documents play no part in building the vocabulary.
y = df[sentiment_column]  # Sentiment is the target
text_train, text_test, y_train, y_test = train_test_split(
    df['Cleaned_Text'], y, test_size=0.2, random_state=42
)

# Step 8: Vectorize the text using CountVectorizer.
# The vocabulary is learned from the training documents only; the test
# documents are merely transformed with that fitted vocabulary.
vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Step 9: Train Logistic Regression model
model = LogisticRegression(max_iter=1000)  # Increase max_iter to avoid convergence warnings
model.fit(X_train, y_train)

# Step 10: Predictions and evaluation
y_pred = model.predict(X_test)

# Display accuracy and other evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Saving sentiment_analysis.csv to sentiment_analysis (11).csv
Columns in the dataset: Index(['Year', 'Month', 'Day', 'Time of Tweet', 'text', 'sentiment',
       'Platform'],
      dtype='object')
Unique values in 'sentiment' column: ['positive' 'negative' 'neutral']
Missing values after mapping: 0
Accuracy: 0.67
Confusion Matrix:
[[18  1 17]
 [ 4 23  7]
 [ 0  4 26]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.50      0.62        36
           1       0.82      0.68      0.74        34
           2       0.52      0.87      0.65        30

    accuracy                           0.67       100
   macro avg       0.72      0.68      0.67       100
weighted avg       0.73      0.67      0.67       100

