In [None]:
import numpy as np
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
onj = SentimentIntensityAnalyzer()
from textblob import TextBlob
import re
import chardet
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Subreddit names known to this notebook. The position of a name in this list
# is used as its integer id by subreddit_dict (name -> index) and
# subreddit_lists (index -> name), so reordering the list changes those ids.
subreddit = [
    "gameofthrones",
    "aww",
    "gaming",
    "news",
    "politics",
    "dankmemes",
    "relationship_advice",
    "nba",
    "worldnews",
    "AskReddit",
    "AmItheAsshole",
    "SquaredCircle",
    "The_Donald",
    "leagueoflegends",
    "hockey",
    "videos",
    "teenagers",
    "gonewild",
    "movies",
    "funny",
    "pics",
    "marvelstudios",
    "memes",
    "soccer",
    "freefolk",
    "MortalKombat",
    "todayilearned",
    "apexlegends",
    "asoiaf",
    "Market76",
    "Animemes",
    "FortNiteBR",
    "nfl",
    "trashy",
    "unpopularopinion",
    "ChapoTrapHouse",
    "RoastMe",
    "Showerthoughts",
    "wallstreetbets",
    # NOTE(review): possibly a truncated "Pikabu" - confirm against the data files.
    "Pikab",
]

In [None]:
# Reverse lookup: subreddit name -> its position in the `subreddit` list.
subreddit_dict = {name: position for position, name in enumerate(subreddit)}

# Sentence

In [None]:
def detect_encoding(file_path):
    """Return chardet's guess for the text encoding of a file.

    The whole file is read in binary mode and passed to ``chardet.detect``;
    the value stored under the ``'encoding'`` key of its result is returned.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        Whatever ``chardet.detect`` reports under ``'encoding'``.
    """
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']

# Path of the CSV file loaded below.
file_path = 'pre-processed-data.csv'
# Guess the file's character encoding from its raw bytes.
detected_encoding = detect_encoding(file_path)

# Parse the CSV into a DataFrame using the detected encoding.
df = pd.read_csv(file_path, encoding=detected_encoding)

In [None]:
# Keep a copy of the frame as loaded; the next cell derives `df` from it.
df1 = df.copy()

In [None]:
# Rebind `df` to the loaded data without its 'score' column (df1 is left untouched).
df = df1.drop(columns=['score'])

In [None]:
# Column-oriented accumulator for the per-text features: one empty list per
# output column, filled row by row in the feature-extraction cell.
df_data = {
    column: []
    for column in (
        'Subjectivity',
        'Polarity',
        'Neg',
        'Pos',
        'Compound',
        'Complexity',
        'Class',
    )
}

In [None]:
def count_syllables(word):
    """Estimate the number of syllables in a single English word.

    Heuristic: every run of consecutive vowels (a, e, i, o, u, y) counts as
    one syllable. A final "e" that follows a consonant is treated as silent
    ("make" -> 1), except in a consonant + "le" ending ("table" -> 2).

    Args:
        word: The token to analyse. Case is ignored, and every character
            other than a-z (punctuation, digits, ...) is dropped first.

    Returns:
        int: Estimated syllable count. 0 if the token contains no letters,
        otherwise at least 1.
    """
    vowels = 'aeiouy'
    # Strip punctuation/digits so tokens like "hello," count the same as "hello".
    letters = re.sub(r'[^a-z]', '', word.lower())
    if not letters:
        return 0

    syllables = len(re.findall(r'[aeiouy]+', letters))

    # "y" is already part of the vowel class above, so a trailing "y" needs no
    # extra increment; adding one would over-count words such as "happy".
    if letters.endswith('e') and syllables > 1:
        after_consonant = letters[-2] not in vowels
        consonant_le = (
            len(letters) > 2
            and letters.endswith('le')
            and letters[-3] not in vowels
        )
        if after_consonant and not consonant_le:
            syllables -= 1

    # Any real word has at least one syllable ("the", "hmm").
    return max(syllables, 1)

def flesch_kincaid_grade_level(text):
    """Compute a Flesch-Kincaid style grade level for a piece of text.

    The score is ``0.39 * (words / sentences) + 11.8 * (syllables / words)
    - 15.59``, rounded to two decimals. Words are whitespace-separated tokens;
    sentences are the non-blank pieces obtained by splitting on runs of
    ``.``, ``?`` and ``!``. Syllables come from ``count_syllables``.

    Args:
        text: The text to score.

    Returns:
        float: The rounded grade level, or 0.0 when the text has no words
        (the averages are undefined in that case).
    """
    words = text.split()
    if not words:
        # Avoid dividing by zero for empty / whitespace-only text.
        return 0.0

    # re.split leaves an empty piece after trailing punctuation ("Hi." ->
    # ["Hi", ""]); count only pieces that contain something.
    sentences = [piece for piece in re.split(r'[.?!]+', text) if piece.strip()]
    sentence_count = max(len(sentences), 1)

    avg_syllables = sum(count_syllables(word) for word in words) / len(words)
    avg_words_per_sentence = len(words) / sentence_count

    fkgl = 0.39 * avg_words_per_sentence + 11.8 * avg_syllables - 15.59
    return round(fkgl, 2)

In [None]:

# Start from empty columns so that re-running this cell does not append a
# second copy of every row to df_data.
for column_values in df_data.values():
    column_values.clear()

# Walk the 'body' column together with the first column of df (the class
# label). Column-wise access avoids building a row Series per iteration and
# does not depend on the index being 0..n-1.
for raw_body, class_label in zip(df['body'], df.iloc[:, 0]):
    text = str(raw_body)

    # TextBlob features.
    blob = TextBlob(text)
    df_data['Subjectivity'].append(blob.subjectivity)
    df_data['Polarity'].append(blob.sentiment.polarity)

    # VADER features.
    polarity_scores = onj.polarity_scores(text)
    df_data['Neg'].append(polarity_scores['neg'])
    df_data['Pos'].append(polarity_scores['pos'])
    df_data['Compound'].append(polarity_scores['compound'])

    # Readability feature.
    df_data['Complexity'].append(flesch_kincaid_grade_level(text))

    df_data['Class'].append(class_label)

In [None]:
# Turn the accumulated column lists into a DataFrame (one column per df_data key).
df_data_ = pd.DataFrame(df_data)

In [None]:
# Persist the feature table; index=False keeps the row index out of the file.
df_data_.to_csv('Mid_data.csv',index = False)

## Training

In [None]:
# Reload the feature table written by the previous section.
data = pd.read_csv("Mid_data.csv")

In [None]:
# First six columns are used as the feature matrix, the last column as the label.
# Presumably these are the six numeric df_data columns and 'Class' - verify
# that Mid_data.csv still has that column layout.
features = data.iloc[:,:6]
labels = data.iloc[:,-1]

In [None]:
# One row per class label holding the per-class average of each feature.
# NOTE(review): the variable is called "medians" but .mean() is what is
# computed - confirm which statistic is intended before renaming or changing.
class_medians = features.groupby(labels).mean()

In [None]:
def return_params(text):
    """Build the six-value feature vector for one piece of text.

    Args:
        text: The text to analyse.

    Returns:
        list: ``[subjectivity, polarity, neg, pos, compound, grade_level]``
        where the first two come from TextBlob, the next three from the
        ``onj`` sentiment analyzer and the last from
        ``flesch_kincaid_grade_level``.
    """
    text_blob = TextBlob(text)
    vader_scores = onj.polarity_scores(text)
    grade_level = flesch_kincaid_grade_level(text)
    return [
        text_blob.subjectivity,
        text_blob.sentiment.polarity,
        vader_scores['neg'],
        vader_scores['pos'],
        vader_scores['compound'],
        grade_level,
    ]

In [None]:
def cosine_similarity(data_point1, data_point2):
    """Return the cosine similarity of two equally long numeric vectors.

    Args:
        data_point1: First vector (any sequence accepted by numpy).
        data_point2: Second vector, same length as the first.

    Returns:
        The dot product divided by the product of the Euclidean norms.
        If either vector has zero length (norm 0) the direction is undefined
        and 0.0 is returned instead of NaN.
    """
    norm1 = np.linalg.norm(data_point1)
    norm2 = np.linalg.norm(data_point2)
    if norm1 == 0 or norm2 == 0:
        # 0/0 would yield NaN (plus a RuntimeWarning), which then poisons
        # any ranking built on the similarity values.
        return 0.0
    return np.dot(data_point1, data_point2) / (norm1 * norm2)

In [None]:
# Ask for the post to classify (input() already yields a str) and turn it
# into the same feature vector layout used for the class averages.
test_text = input("Enter the post whose subreddit you want to find: ")
test_data_point = return_params(test_text)

In [None]:
# Map each row position of class_medians to the cosine similarity between the
# test post's feature vector and that row.
similarity = {
    row: cosine_similarity(test_data_point, list(class_medians.iloc[row]))
    for row in range(class_medians.shape[0])
}

In [None]:
# (row position, similarity) pairs sorted from most to least similar.
# NOTE(review): the name says "top3" but the slice keeps the first 10 pairs -
# confirm which count is intended.
top3_elements = sorted(similarity.items(), key=lambda x: x[1], reverse=True)[:10]

# Word parameters

## Ishaan's code

In [None]:
# Maps the second CSV field of each data line to the cleaned remainder of that line.
token_dict = {}

with open("pre-processed-data_`.csv", "r", encoding="utf8", errors="ignore") as f:
    # The first line is a header; skip it (no error if the file is empty).
    next(f, None)

    for line in f:
        # Split only on the first two commas so commas inside the text field survive.
        fields = line.split(",", 2)
        if len(fields) < 3:
            # Blank or truncated line: nothing usable, and unpacking would fail.
            continue

        _unused, key, raw_text = fields

        # Remove digits, underscores and literal "|" characters (inside a
        # character class "|" is not an alternation). Raw string avoids the
        # invalid-escape warning raised for '\d' in a normal string literal.
        token_dict[key] = re.sub(r'[\d|\_]', '', raw_text)

In [None]:
# Fit a TF-IDF model with one document per token_dict entry.
tfidf = TfidfVectorizer(stop_words='english', sublinear_tf=True)
tfs = tfidf.fit_transform(token_dict.values())

feature_names = tfidf.get_feature_names_out()

# Expand the sparse matrix into a labelled table: rows are the token_dict
# keys, columns are the vocabulary terms. Note that this rebinds `df`.
dense = tfs.todense()
denselist = dense.tolist()
df = pd.DataFrame(
    data=denselist,
    index=list(token_dict),
    columns=feature_names,
)

In [None]:
# Forward lookup: position in the `subreddit` list -> subreddit name.
subreddit_lists = dict(enumerate(subreddit))

In [None]:
# Translate the ranked (index, similarity) pairs into subreddit names,
# keeping the most-similar-first order of top3_elements.
inputsubs = [subreddit_lists[class_index] for class_index, _score in top3_elements]

# Subreddit name -> up to 100 of its highest-weighted TF-IDF terms.
impwords = {}

# The loop variable is deliberately NOT named `subreddit`: that would rebind
# the module-level `subreddit` list to a string and break any cell that is
# re-run afterwards (subreddit_dict / subreddit_lists).
for sub_name in inputsubs:
    term_weights = pd.Series(df.loc[sub_name])
    # Keep terms with a weight above 0.0001, heaviest first, capped at 100.
    top_terms = term_weights[term_weights > 0.0001].sort_values(ascending=False)[:100]
    impwords[sub_name] = top_terms.index.tolist()

In [24]:
def count_matching_words(words, text):
    """Count how many whitespace-separated tokens of ``text`` occur in ``words``.

    Matching is done on the lower-cased token. A token also counts when it
    matches after leading/trailing non-word characters are removed, so that
    "thrones!" or "(game" still match "thrones" / "game". Repeated tokens are
    counted every time they appear.

    Args:
        words: Iterable of (lower-case) word strings to look for.
        text: The text whose tokens are checked.

    Returns:
        int: Number of tokens in ``text`` that are found in ``words``.
    """
    # Set membership keeps each lookup O(1) regardless of how many words there are.
    vocabulary = set(words)
    count = 0
    for token in text.lower().split():
        if token in vocabulary:
            count += 1
            continue
        # Retry without surrounding punctuation.
        stripped = re.sub(r'^\W+|\W+$', '', token)
        if stripped in vocabulary:
            count += 1
    return count

In [28]:
# impwords was filled in most-similar-first order, so its first key is the
# best guess when no candidate shares any word with the post. Falling back to
# it avoids reporting the meaningless integer 0 as the "subreddit".
# (0 is kept only for the case where there are no candidates at all.)
most_prob_sub = next(iter(impwords), 0)
count = 0
for candidate, important_words in impwords.items():
    # Count once per candidate instead of calling the helper twice.
    matches = count_matching_words(important_words, test_text)
    # Strict ">" keeps the earlier (more similar) candidate on ties.
    if matches > count:
        count = matches
        most_prob_sub = candidate

In [29]:
# Show the subreddit selected by the word-overlap step.
print(most_prob_sub)

'teenagers'

# You don't need to add the next part; it is for further extensions

In [None]:
import pandas as pd
from textblob import TextBlob
import re


def analyze_text(text):
    """
    Analyze a text and return a dictionary with three values:
        - joy: the TextBlob sentiment polarity of the text
        - anger: the TextBlob sentiment subjectivity of the text
        - complexity: average number of syllables per whitespace-separated
          word (0 when the text contains no words)

    NOTE(review): "anger" is filled from the subjectivity score, which is not
    an anger measure - the key names look like placeholders; confirm intent.
    """
    blob = TextBlob(text)

    # Sentiment-derived scores.
    joy = blob.sentiment.polarity
    anger = blob.sentiment.subjectivity

    # Word complexity: total syllables divided by the number of words.
    tokens = text.split()
    total_syllables = sum(count_syllables(token) for token in tokens)
    if tokens:
        complexity = total_syllables / len(tokens)
    else:
        complexity = 0

    return {
        "joy": joy,
        "anger": anger,
        "complexity": complexity,
    }


def count_syllables(word):
    """Estimate the number of syllables in a single English word.

    Heuristic: every run of consecutive vowels (a, e, i, o, u, y) counts as
    one syllable. A final "e" that follows a consonant is treated as silent
    ("make" -> 1), except in a consonant + "le" ending ("table" -> 2).

    Args:
        word: The token to analyse. Case is ignored, and every character
            other than a-z (punctuation, digits, ...) is dropped first.

    Returns:
        int: Estimated syllable count. 0 if the token contains no letters,
        otherwise at least 1.
    """
    vowels = "aeiouy"
    # Strip punctuation/digits so tokens like "hello," count the same as "hello".
    letters = re.sub(r"[^a-z]", "", word.lower())
    if not letters:
        return 0

    # Count vowel groups.
    syllables = len(re.findall(r"[aeiouy]+", letters))

    # "y" is already part of the vowel class above, so a trailing "y" needs no
    # extra increment; adding one would over-count words such as "happy".
    if letters.endswith("e") and syllables > 1:
        after_consonant = letters[-2] not in vowels
        consonant_le = (
            len(letters) > 2
            and letters.endswith("le")
            and letters[-3] not in vowels
        )
        if after_consonant and not consonant_le:
            syllables -= 1

    # Any real word has at least one syllable ("the", "hmm").
    return max(syllables, 1)


def flesch_kincaid_grade_level(text):
    """Compute a Flesch-Kincaid style grade level for a piece of text.

    The score is ``0.39 * (words / sentences) + 11.8 * (syllables / words)
    - 15.59``, rounded to two decimals. Words are whitespace-separated tokens;
    sentences are the non-blank pieces obtained by splitting on runs of
    ``.``, ``?`` and ``!``. Syllables come from ``count_syllables``.

    Args:
        text: The text to score.

    Returns:
        float: The rounded grade level, or 0.0 when the text has no words
        (the averages are undefined in that case).
    """
    words = text.split()
    if not words:
        # Avoid dividing by zero for empty / whitespace-only text.
        return 0.0

    # re.split leaves an empty piece after trailing punctuation ("Hi." ->
    # ["Hi", ""]); count only pieces that contain something.
    sentences = [piece for piece in re.split(r"[.?!]+", text) if piece.strip()]
    sentence_count = max(len(sentences), 1)

    avg_syllables = sum(count_syllables(word) for word in words) / len(words)
    avg_words_per_sentence = len(words) / sentence_count

    fkgl = 0.39 * avg_words_per_sentence + 11.8 * avg_syllables - 15.59

    return round(fkgl, 2)

# Quick demonstration of analyze_text on a single long word.
text = "Supercalifragilisticexpialidocious"
analysis_data = analyze_text(text)

# Prints the dictionary with the "joy", "anger" and "complexity" entries.
print(analysis_data)
