#22BCE097- Arhaan Godhrawala

In [None]:
import pandas as pd
import json
import spacy
from transformers import pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# VADER for sentiment analysis: download its lexicon first, then build the
# analyzer that compute_sentiment() calls
nltk.download("vader_lexicon")
sia = SentimentIntensityAnalyzer()

# Load the spaCy model "en_core_web_sm" for preprocessing text
# (used by extract_named_entities() to find named entities)
nlp = spacy.load("en_core_web_sm")

# Load a JSON Lines dump into a DataFrame
def load_data(file_path):
    """Read a JSON Lines file and return its ``"data"`` payloads as a DataFrame.

    Every non-blank line must be a JSON object with a ``"data"`` key; the
    value stored under that key becomes one row of the result.

    Args:
        file_path: Path to a UTF-8 encoded JSON Lines file.

    Returns:
        A ``pandas.DataFrame`` with one row per non-blank input line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a parsed record has no ``"data"`` key.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        # Skip blank / whitespace-only lines: json.loads("") would raise and
        # abort the whole load because of a stray empty line.
        stripped_lines = (line.strip() for line in file)
        records = [json.loads(line)["data"] for line in stripped_lines if line]
    return pd.DataFrame(records)

# Load data.jsonl file here
df = load_data("data.jsonl")

# Retain just the relevant columns; .copy() makes the subset an independent
# frame so the in-place edits below never touch the originally loaded data
relevant_columns = ["id", "subreddit", "title", "selftext", "author", "created_utc", "num_comments", "ups", "score", "permalink", "url"]
df = df[relevant_columns].copy()
# created_utc is interpreted as seconds since the Unix epoch
df["created_utc"] = pd.to_datetime(df["created_utc"], unit="s")

# Drop duplicate posts (same id), keeping the first occurrence
df.drop_duplicates(subset=["id"], inplace=True)

# Handle missing values per column type. A blanket fillna("") would put an
# empty string into a numeric column, which breaks the later
# num_comments + ups arithmetic, and would turn created_utc into an object
# column if any timestamp were missing.
text_columns = ["id", "subreddit", "title", "selftext", "author", "permalink", "url"]
count_columns = ["num_comments", "ups", "score"]
df.fillna({**dict.fromkeys(text_columns, ""), **dict.fromkeys(count_columns, 0)}, inplace=True)

# Sentiment Analysis using VADER
def compute_sentiment(text):
    """Return the "compound" entry of VADER's polarity scores for ``text``."""
    polarity = sia.polarity_scores(text)
    return polarity["compound"]


# Score every post body
df["sentiment_score"] = df["selftext"].map(compute_sentiment)

# Emotion Analysis
def classify_emotion(score):
    """Bucket a sentiment score into "Positive", "Negative" or "Neutral".

    Scores strictly above 0.3 are "Positive" and scores strictly below -0.3
    are "Negative"; everything else, including exactly 0.3 and -0.3, is
    "Neutral".
    """
    if score > 0.3:
        return "Positive"
    if score < -0.3:
        return "Negative"
    return "Neutral"
# Label every post from its sentiment score
df["emotion_category"] = df["sentiment_score"].map(classify_emotion)

# Misinformation Classification using MNLI Model
fact_checker = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")


def classify_misinformation(text):
    """Return the first (most likely) label the zero-shot classifier gives ``text``.

    The candidate labels are "Reliable", "Unreliable" and "Can't Say".
    """
    ranking = fact_checker(text, ["Reliable", "Unreliable", "Can't Say"])
    ranked_labels = ranking["labels"]
    return ranked_labels[0]


# Classify each post by its title
df["misinformation_label"] = df["title"].map(classify_misinformation)

# Named Entity Recognition (NER) with spaCy
def extract_named_entities(text):
    """Return the text of the spaCy entities in ``text``, minus noise.

    Entities labelled CARDINAL, ORDINAL, DATE or TIME are dropped, as are
    entities whose lower-cased text is one of a few filler words.
    """
    skipped_labels = {"CARDINAL", "ORDINAL", "DATE", "TIME"}
    filler_words = {"the", "what", "this", "is", "a", "it"}
    return [
        ent.text
        for ent in nlp(text).ents
        if ent.label_ not in skipped_labels and ent.text.lower() not in filler_words
    ]


# Extract entities from every post body
df["named_entities"] = df["selftext"].map(extract_named_entities)

# Hashtag extraction
def extract_hashtags(text):
    """Return the hashtags (e.g. ``"#topic"``) found in ``text``, in order.

    A hashtag is a single ``#`` followed by word characters, at least one of
    which is a letter. This deliberately skips Markdown headings (``#``,
    ``##``, ``##If``), numbering (``#1``, ``#2.``) and in-word fragments such
    as ``page#section``. Trailing punctuation is not part of the tag.

    Args:
        text: The string to scan.

    Returns:
        A list of hashtag strings, each including its leading ``#``.
    """
    import re  # local import keeps this function self-contained

    # (?<![\w#])      -> '#' must not follow a word character or another '#'
    # (?=\w*[^\W\d_]) -> the tag must contain at least one letter
    return re.findall(r"(?<![\w#])#(?=\w*[^\W\d_])\w+", text)
# Collect the hashtags of every post body
df["hashtags"] = df["selftext"].map(extract_hashtags)

# Subreddit Total Activity Metrics: per-post activity is comments plus upvotes
df["total_activity"] = df["num_comments"].add(df["ups"])

# Per subreddit: number of posts (count of ids) and summed comments, upvotes and activity
activity_aggregations = {
    "id": "count",
    "num_comments": "sum",
    "ups": "sum",
    "total_activity": "sum",
}
posts_by_subreddit = df.groupby("subreddit")
subreddit_activity = posts_by_subreddit.agg(activity_aggregations).reset_index()

# Top Users Analysis: one row per author with their number of posts, most active first
author_post_counts = df["author"].value_counts()
top_users = author_post_counts.rename_axis("author").reset_index(name="post_count")

# Topic Modeling with LDA (using just 5 topics)
# Each document is a post's title and body joined by a single space
topic_corpus = df["title"] + " " + df["selftext"]

# TF-IDF features: English stop words removed, vocabulary capped at 1000 terms
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(topic_corpus)

# Fit LDA with a fixed seed and get the per-document topic weights
lda_model = LatentDirichletAllocation(random_state=42, n_components=5)
lda_topics = lda_model.fit_transform(X_tfidf)

# A post's dominant topic is the index of its largest topic weight
df["dominant_topic"] = lda_topics.argmax(axis=1)

# Persist the enriched dataframe (without the index column) and report its size
df.to_csv("final_df.csv", index=False)
print(f"Final dataset shape: {df.shape}")


Final dataset shape: (8799, 18)


In [None]:
# Reload the dataset written to final_df.csv
final_df = pd.read_csv("final_df.csv")

In [None]:
# Display the reloaded dataframe as the cell output
final_df

Unnamed: 0,id,subreddit,title,selftext,author,created_utc,num_comments,ups,score,permalink,url,sentiment_score,emotion_category,misinformation_label,named_entities,hashtags,total_activity,dominant_topic
0,1is5wgo,Anarchism,What Are You Reading/Book Club Tuesday,"What you are reading, watching, or listening ...",AutoModerator,2025-02-18 06:01:00,1,2,2,/r/Anarchism/comments/1is5wgo/what_are_you_rea...,https://www.reddit.com/r/Anarchism/comments/1i...,0.0000,Neutral,Reliable,"[What, Are, You, Reading/Book, Club, Tuesday]",[],3,3
1,1irrceg,Anarchism,"""WTF is Social Ecology?"" by Usufruct Collective",,NewMunicipalAgenda,2025-02-17 18:47:05,2,48,48,/r/Anarchism/comments/1irrceg/wtf_is_social_ec...,https://usufructcollective.wordpress.com/2025/...,0.0000,Neutral,Can't Say,"[Social, Ecology?"", Usufruct, Collective]",[],50,2
2,1ir8tnp,Anarchism,Who do you think is the most powerful/popular ...,I am an anarcho-nihilist and i am reading simi...,Charming-Score7015,2025-02-17 01:57:39,2,3,3,/r/Anarchism/comments/1ir8tnp/who_do_you_think...,https://www.reddit.com/r/Anarchism/comments/1i...,0.6297,Positive,Can't Say,[Who],[],5,4
3,1irq9vp,Anarchism,Not paying student loans or taxes,"Ayo, im an anarchist but perhaps not the most ...",NoBackground7266,2025-02-17 18:05:32,20,33,33,/r/Anarchism/comments/1irq9vp/not_paying_stude...,https://www.reddit.com/r/Anarchism/comments/1i...,0.7368,Positive,Unreliable,[Not],[],53,4
4,1irojku,Anarchism,Recommendations for intellectual debate content,Looking for videos/podcasts/writing where folk...,Jewstun,2025-02-17 16:57:44,1,7,7,/r/Anarchism/comments/1irojku/recommendations_...,https://www.reddit.com/r/Anarchism/comments/1i...,0.9392,Positive,Reliable,[Recommendations],[],8,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8794,1gq6mvv,worldpolitics,The emperor protects,,Appropriate-Humor-33,2024-11-13 06:22:37,1,67,67,/r/worldpolitics/comments/1gq6mvv/the_emperor_...,https://i.redd.it/zl9qj7qw3m0e1.jpeg,0.0000,Neutral,Reliable,[The],[],68,0
8795,1gq6k66,worldpolitics,That's just what I was looking for,,[deleted],2024-11-13 06:17:36,5,17,17,/r/worldpolitics/comments/1gq6k66/thats_just_w...,https://i.redd.it/9l6i7of03m0e1.png,0.0000,Neutral,Reliable,[I],[],22,4
8796,1gq47q0,worldpolitics,You fucks want some Tacos de Birria or what?,They were amazing. My friend in my grad progra...,davy89irox,2024-11-13 03:56:29,5,42,42,/r/worldpolitics/comments/1gq47q0/you_fucks_wa...,https://i.redd.it/mvvtk35udl0e1.jpeg,0.8805,Positive,Can't Say,"[You, Tacos, Birria]",[],47,4
8797,1gq2vcc,worldpolitics,What are your builds in Elden Ring? Tell me wh...,,ShutUpRedditor44,2024-11-13 02:45:40,7,4,4,/r/worldpolitics/comments/1gq2vcc/what_are_you...,https://i.redd.it/a3sm70k41l0e1.png,0.0000,Neutral,Unreliable,"[What, Elden, Ring?, Tell]",[],11,2


In [None]:
# Vocabulary terms learned by the TF-IDF vectorizer, used to name each topic's words
feature_names = tfidf_vectorizer.get_feature_names_out()
num_words = 15

# For each LDA topic, print its num_words highest-weighted words, strongest first
for topic_idx, topic in enumerate(lda_model.components_):
    strongest_first = topic.argsort()[::-1][:num_words]
    top_words = [feature_names[i] for i in strongest_first]
    print(f"Topic {topic_idx}: {', '.join(top_words)}")


Topic 0: day, art, tifa, posting, lockhart, party, 2025, aid, women, amp, democratic, red, radical, mutual, germany
Topic 1: trump, musk, doge, says, elon, federal, administration, tariffs, president, house, government, biden, ukraine, canada, donald
Topic 2: trump, people, just, time, like, democrats, right, don, election, think, media, republicans, going, harris, say
Topic 3: https, com, www, video, org, new, trump, jimmy, china, watch, peertube, europe, amp, big, putin
Topic 4: like, just, know, anarchist, ve, don, good, people, fucks, want, free, vance, think, really, socialist


In [None]:
# Count how many rows fall under each dominant_topic value
final_df['dominant_topic'].value_counts()

Unnamed: 0_level_0,count
dominant_topic,Unnamed: 1_level_1
1,2560
2,2346
4,1806
0,1234
3,853


In [None]:
# Count the distinct values of the hashtags column.
# NOTE(review): final_df was read back from CSV, so each cell is presumably the
# string form of a whole list and is counted as one value — confirm whether
# per-hashtag counts were intended instead.
final_df['hashtags'].value_counts()

Unnamed: 0_level_0,count
hashtags,Unnamed: 1_level_1
[],8746
"[##, ##, ##]",15
"[#, #]",4
[#],4
[#1],3
[#adaywithoutimmigrants],1
[#jokingnotjokingwink],1
"[#1., #2., #3.]",1
"[#, #, #, #]",1
[##If],1
