In [3]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
df = pd.read_csv("Twitter_Data.csv")

# Drop any null values
df.dropna(inplace=True)

# Preprocessing function
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r"\@\w+|\#", "", text)  # Remove mentions and hashtags
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

df["clean_text"] = df["clean_text"].astype(str).apply(preprocess_text)

# Splitting dataset
X = df["clean_text"]
y = df["category"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Vectorization
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Model training
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# Predictions
y_pred = model.predict(X_test_tfidf)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PREETHI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.731484322267902
Classification Report:
               precision    recall  f1-score   support

        -1.0       0.88      0.44      0.59      7102
         0.0       0.84      0.66      0.74     11042
         1.0       0.66      0.93      0.77     14450

    accuracy                           0.73     32594
   macro avg       0.79      0.68      0.70     32594
weighted avg       0.77      0.73      0.72     32594



In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

In [None]:
df = pd.read_csv("Twitter_Data.csv")


In [None]:
df.dropna(inplace=True)


In [None]:
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

In [None]:
def preprocess_text(text):
    text = text.lower()  
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  
    text = re.sub(r"\@\w+|\#", "", text)  
    text = text.translate(str.maketrans("", "", string.punctuation))  
    text = " ".join([word for word in text.split() if word not in stop_words]) 
    return text

In [None]:
df["clean_text"] = df["clean_text"].astype(str).apply(preprocess_text)


In [None]:
X = df["clean_text"]
y = df["category"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [None]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

In [None]:
y_pred = model.predict(X_test_tfidf)


In [None]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
