In [None]:
import re
import nltk
import string
import pandas as pd
import emoji
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from contractions import fix
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK resources used below: the stop-word list, the tokenizer
# models, and the WordNet data (plus its multilingual extension).
for _nltk_resource in ('stopwords', 'punkt', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# Read the dataset from the Excel workbook (adjust the path for a local copy)
df = pd.read_excel("ML.xlsx")

# Shared helpers for clean_text: a set gives O(1) stop-word membership tests
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize a raw post into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, expand contractions, strip URLs and
    @mentions, delete ASCII punctuation, tokenize, drop English stop words
    and lemmatize the remaining tokens. Emojis are not part of
    ``string.punctuation`` and are therefore not removed by the punctuation
    step.

    Args:
        text: The post text. Missing values (``None``, NaN, ``pd.NA``) yield
            an empty string; any other non-string scalar is converted with
            ``str`` before cleaning.

    Returns:
        The cleaned tokens joined by single spaces, or ``''`` when the input
        is missing.
    """
    if not isinstance(text, str):
        # Empty spreadsheet cells arrive as NaN/None; calling .lower() on
        # them would raise and abort the whole DataFrame.apply.
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    # Convert to lowercase
    text = text.lower()

    # Expand contractions (e.g., "can't" → "cannot")
    text = fix(text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)

    # Remove ASCII punctuation in a single pass (emojis are left in place)
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize text
    words = word_tokenize(text)

    # Remove stopwords and apply lemmatization
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

    # Join words back into a single string
    return ' '.join(words)

# Treat missing posts as empty text so the per-row string helpers below never
# receive NaN (empty spreadsheet cells are read as float NaN, which has no
# string methods and cannot be iterated).
post_text = df['Post_Text'].fillna('').astype(str)

# Apply cleaning function to the post text
df['Cleaned_Post_Text'] = post_text.apply(clean_text)

# Strip the '#' marker from the Hashtags column. regex=False makes this a
# literal replacement regardless of the installed pandas version's default.
df['Hashtags'] = df['Hashtags'].str.replace('#', '', regex=False)

# Convert emojis into their text names, delimited by spaces
df['Post_Text_Emojis_Converted'] = post_text.apply(
    lambda x: emoji.demojize(x, delimiters=(" ", " "))
)

# Encode Sentiments (labels outside this mapping become NaN)
df['Sentiment_Encoded'] = df['Sentiment'].map({'Positive': 1, 'Negative': -1, 'Neutral': 0})

# Feature Engineering: Word Count and Emoji Count
df['Word_Count'] = df['Cleaned_Post_Text'].apply(lambda x: len(x.split()))
# emoji.emoji_count counts whole emojis; testing individual code points
# against EMOJI_DATA miscounts multi-code-point emojis such as flags and
# skin-tone or ZWJ sequences.
df['Emoji_Count'] = post_text.apply(lambda x: emoji.emoji_count(x))

# Analyze Device_Type and Location impact on Sentiment
device_sentiment = df.groupby('Device_Type')['Sentiment_Encoded'].mean()
location_sentiment = df.groupby('Location')['Sentiment_Encoded'].mean()

# Tokenization & Vectorization
# TF-IDF Vectorization (vocabulary capped at 5000 terms)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
df_tfidf = tfidf_vectorizer.fit_transform(df['Cleaned_Post_Text']).toarray()

# Get feature names (words corresponding to TF-IDF values)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Convert TF-IDF matrix into a DataFrame
df_tfidf_result = pd.DataFrame(df_tfidf, columns=feature_names)

# Use the DataFrame row index as the post identifier so TF-IDF rows can be
# matched back to the rows of the cleaned dataset
df_tfidf_result.insert(0, 'Post_ID', df.index)


df_tfidf_result.to_csv("TFIDF_Output.csv", index=False)
print("TF-IDF results saved as 'TFIDF_Output.csv'.")

df.to_csv("Cleaned_ML.csv", index=False)
print("Preprocessing complete. Cleaned data saved as 'Cleaned_ML.csv'.")

# Display impact analysis results
print("Sentiment impact by Device_Type:\n", device_sentiment)
print("\nSentiment impact by Location:\n", location_sentiment)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


TF-IDF results saved as 'TFIDF_Output.csv'.
Preprocessing complete. Cleaned data saved as 'Cleaned_ML.csv'.
Sentiment impact by Device_Type:
 Device_Type
Desktop   -0.017236
Mobile     0.021005
Tablet    -0.009977
Name: Sentiment_Encoded, dtype: float64

Sentiment impact by Location:
 Location
Berlin, Germany            0.008114
Cape Town, South Africa    0.028112
London, UK                -0.074427
Mumbai, India              0.005181
New York, USA             -0.009045
Paris, France              0.047813
Sao Paulo, Brazil         -0.014521
Sydney, Australia          0.055165
Tokyo, Japan              -0.073602
Toronto, Canada            0.012270
Name: Sentiment_Encoded, dtype: float64
