In [6]:
import json
import re
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset
def load_data(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        return json.load(file)

# Clean and preprocess text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    return text

# Tokenize, remove stopwords, and lemmatize
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    words = [lemmatizer.lemmatize(word) for word in words]  # Lemmatization
    
    return " ".join(words)

# Sentiment analysis
def analyze_sentiment(text):
    return TextBlob(text).sentiment.polarity  # Returns score between -1 (negative) and 1 (positive)

# Process dataset
def process_dataset(data):
    processed_data = []
    for record in data:
        cleaned_text = clean_text(record["text"])
        processed_text = preprocess_text(cleaned_text)
        sentiment_score = analyze_sentiment(processed_text)

        processed_data.append({
            "title": record["title"],
            "cleaned_text": processed_text,
            "score": record["score"],
            "num_comments": record["num_comments"],
            "created_utc": record["created_utc"],
            "location": record["location"],
            "category": record["category"],
            "sentiment_score": sentiment_score
        })

    return processed_data

# Save results
def save_results(data, output_file):
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

# Main execution
if __name__ == "__main__":
    input_filename = "cleaned_data.json"  # Replace with actual file path
    output_filename = "processed_dataset.json"

    raw_data = load_data(input_filename)
    processed_data = process_dataset(raw_data)
    save_results(processed_data, output_filename)

    print(f"Processing complete. Results saved to {output_filename}.")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Komal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Komal\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Komal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Processing complete. Results saved to processed_dataset.json.


In [2]:
pip install pandas numpy nltk scikit-learn






[notice] A new release of pip is available: 24.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
pip install textblob


Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ------------------------------ --------- 471.0/624.3 kB 9.8 MB/s eta 0:00:01
   ---------------------------------- ----- 532.5/624.3 kB 6.6 MB/s eta 0:00:01
   ---------------------------------------- 624.3/624.3 kB 5.6 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.19.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
pip install pandas transformers emoji matplotlib wordcloud prophet seaborn


Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
   ---------------------------------------- 0.0/590.6 kB ? eta -:--:--
   ---------------------- ---------------- 348.2/590.6 kB 10.9 MB/s eta 0:00:01
   ------------------------------------ --- 532.5/590.6 kB 6.7 MB/s eta 0:00:01
   ---------------------------------------- 590.6/590.6 kB 6.2 MB/s eta 0:00:00
Installing collected packages: emoji
Successfully installed emoji-2.14.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.1.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [9]:
import json
import re
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import emoji
from wordcloud import WordCloud
from datetime import datetime
from transformers import pipeline
from prophet import Prophet

# Load Data
with open("cleaned_data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Convert JSON to DataFrame
df = pd.DataFrame(data)

# Convert timestamp to datetime
df["created_utc"] = pd.to_datetime(df["created_utc"], format="%Y-%m-%d %H:%M:%S")

# Function to extract hashtags
def extract_hashtags(text):
    return re.findall(r"#\w+", text)

df["hashtags"] = df["text"].apply(extract_hashtags)

# Function to extract emojis
def extract_emojis(text):
    return [c for c in text if c in emoji.EMOJI_DATA]

df["emojis"] = df["text"].apply(extract_emojis)

# Sentiment Analysis
sentiment_model = pipeline("sentiment-analysis")
df["sentiment"] = df["text"].apply(lambda x: sentiment_model(x)[0]["label"])

# Trend Analysis using Prophet
trend_df = df[["created_utc", "score"]].rename(columns={"created_utc": "ds", "score": "y"})
model = Prophet()
model.fit(trend_df)

# Future prediction
future = model.make_future_dataframe(periods=30)  # Predict next 30 days
forecast = model.predict(future)

# Visualization
plt.figure(figsize=(10, 5))
sns.countplot(data=df, x="sentiment", palette="coolwarm")
plt.title("Sentiment Distribution")
plt.show()

# Hashtag Analysis
all_hashtags = [hashtag for hashtags in df["hashtags"] for hashtag in hashtags]
hashtag_freq = pd.Series(all_hashtags).value_counts().head(20)

plt.figure(figsize=(12, 6))
sns.barplot(x=hashtag_freq.index, y=hashtag_freq.values, palette="viridis")
plt.xticks(rotation=45)
plt.title("Top Hashtags")
plt.show()

# Generate Word Cloud
text_data = " ".join(df["text"])
wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text_data)

plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title("Word Cloud of Social Media Posts")
plt.show()

# Prophet Trend Plot
model.plot(forecast)
plt.title("Trend Analysis using Prophet")
plt.show()



No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

RuntimeError: Failed to import transformers.models.distilbert.modeling_tf_distilbert because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [11]:
pip uninstall keras


^C
Note: you may need to restart the kernel to use updated packages.
