### Import Data

In [4]:
import json
def load_data(file):
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        file: Path of the JSON file to open.

    Returns:
        The Python object produced by parsing the file's JSON content.
    """
    with open(file, mode="r", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed

# Story collection used by the cells below: the value stored under the
# top-level "File" key of local_data.json.
# NOTE(review): presumably a list of story strings (corpus[0] is displayed as
# plain text further down) — confirm against the data file.
corpus = load_data("local_data.json")["File"]
# corpus[0]

## Keyword Extraction

### Named Entity Recognition

#### Import Libraries

In [None]:
import json
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from collections import Counter
import nltk 
# Fetch every NLTK resource the keyword extractor relies on:
#   stopwords                  -> stopwords.words('english')
#   punkt                      -> word_tokenize
#   averaged_perceptron_tagger -> pos_tag
#   wordnet                    -> WordNetLemmatizer
#   maxent_ne_chunker, words   -> ne_chunk
# NOTE(review): recent NLTK releases may expect differently named resources
# (e.g. 'punkt_tab'); follow the name in the LookupError if one is raised.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('words')


#### Keyword Extractor

In [None]:

# Define a function to extract keywords
def extract_keywords(text, top_n=10):
    """Return the most frequent lemmas and named entities found in ``text``.

    The text is lower-cased, tokenized, stripped of English stop words and of
    tokens that contain no letter or digit, then lemmatized with WordNet using
    each token's part-of-speech tag. Named entities are detected separately on
    the original-case text and counted together with the lemmas.

    Args:
        text: The document to analyse, as a string.
        top_n: Maximum number of keywords to return (default 10).

    Returns:
        A list of at most ``top_n`` strings ordered from most to least frequent.
    """
    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stopwords (set membership is O(1)) and tokens that are pure
    # punctuation, which would otherwise dominate the frequency ranking.
    stopword_set = set(stopwords.words('english'))
    tokens = [
        token for token in tokens
        if token not in stopword_set and any(ch.isalnum() for ch in token)
    ]

    # Tag the tokens with their part of speech
    tagged_tokens = pos_tag(tokens)

    # Convert Penn Treebank tags to WordNet compatible tags
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        # An empty string is not a valid WordNet POS; fall back to the
        # lemmatizer's own default (noun) for every other tag.
        return wordnet.NOUN

    # Lemmatize the tokens using WordNet. The lemmatizer class comes from
    # nltk.stem; it is not an attribute of the wordnet corpus reader.
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(tag))
        for token, tag in tagged_tokens
    ]

    # Identify named entities using NLTK's named entity recognition (NER) module.
    # Case is preserved here because the chunker relies on capitalisation.
    named_entities = ne_chunk(pos_tag(word_tokenize(text)))

    # Only subtrees carry a label; join their words so every entity is a plain
    # string (possibly multi-word) rather than a (word, tag) tuple.
    entity_names = [
        ' '.join(word for word, _tag in chunk.leaves())
        for chunk in named_entities
        if hasattr(chunk, 'label')
    ]

    # Extract the most common lemmas and named entities
    frequency = Counter(lemmatized_tokens + entity_names)
    keywords = [token for token, _count in frequency.most_common(top_n)]

    return keywords

#### Implementation

In [None]:
#download using nltk.download('punkt') if you get an nltk error
# Extract keywords for every story. The whole story is passed to the extractor;
# indexing the story with its position in the corpus would select a single
# character (or raise IndexError once the position exceeds the story length).
all_keywords = []
for story in corpus:
    keywords = extract_keywords(story)
    all_keywords.append(keywords)

### TF-IDF

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import nltk
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Load the English language model
nlp = spacy.load("en_core_web_sm")

# Get the stop words for English (spaCy's default set for this pipeline)
spacy_stop_words = nlp.Defaults.stop_words

# Build the set of English stop words shipped with NLTK.
# NOTE(review): this set is not referenced by the preprocessing function in
# this cell, which filters with spacy_stop_words only.
nltk_stop_words = set(stopwords.words('english'))

# Function to preprocess the input corpus
def preprocess_text(text):
    """Tokenize one story and keep only alphabetic, non-stop-word tokens.

    Args:
        text: A single story as a string.

    Returns:
        A list of lower-cased tokens, in their original order, that consist
        solely of alphabetic characters and are not in ``spacy_stop_words``.
    """
    kept_tokens = []
    for word in word_tokenize(text.lower()):
        # Skip anything that is not purely alphabetic (punctuation, numbers,
        # mixed tokens).
        if not word.isalpha():
            continue
        # Skip spaCy stop words.
        if word in spacy_stop_words:
            continue
        kept_tokens.append(word)
    return kept_tokens

# One list of cleaned tokens per story, in corpus order.
cleaned_corpus = [preprocess_text(doc) for doc in corpus]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize using TfidfVectorizer, which combines counting and normalized weighting.
# Fit on the preprocessed stories (stop words and non-alphabetic tokens already
# removed) rather than the raw text, so the cleaning step actually influences
# the scores. Token lists are re-joined because the vectorizer expects strings.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(
    [" ".join(tokens) for tokens in cleaned_corpus]
)

# Vocabulary terms, aligned with the columns of tfidf_matrix.
feature_names = vectorizer.get_feature_names_out()

In [None]:
import numpy as np

def get_top_keywords(tfidf_scores, feature_names, top_n):
    """Pick the names of the highest-scoring features.

    Args:
        tfidf_scores: One score per feature, aligned with ``feature_names``.
        feature_names: Indexable collection of feature names.
        top_n: Number of leading entries to keep.

    Returns:
        A list with the names of the ``top_n`` best-scoring features, ordered
        from highest to lowest score.
    """
    # argsort is ascending, so reverse it to rank from best to worst.
    ranking = np.argsort(tfidf_scores)[::-1]
    selected = []
    for position in ranking[:top_n]:
        selected.append(feature_names[position])
    return selected

# Report the ten highest-weighted terms of every document, numbered from 1.
for i, doc in enumerate(corpus):
    # Row i of the sparse matrix, densified to a flat score vector.
    dense_row = tfidf_matrix[i].toarray()
    tfidf_scores = dense_row[0]
    top_keywords = get_top_keywords(tfidf_scores, feature_names, top_n=10)
    print(f"Document {i+1} top keywords: {top_keywords}")


##### Experimental 

In [None]:
import csv

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Define the keywords and phrases that are likely to appear in relevant stories
relevant_keywords = ['low socio-economic background', 'higher education', 'difficulties', 'overcome']

# Define the path to the file containing the stories
file_path = 'path/to/file.csv'

# Minimum cosine similarity for a story to be reported.
# NOTE(review): 0.1 is a placeholder starting point — tune on real data.
similarity_threshold = 0.1

# Read in the stories from the file (newline='' is what the csv module expects)
with open(file_path, 'r', encoding='utf-8', newline='') as csv_file:
    csv_reader = csv.DictReader(csv_file)
    stories = [row['story'] for row in csv_reader]

# Define a vectorizer that will convert the stories into a matrix of TF-IDF features
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)

# Convert the stories into a matrix of TF-IDF features
story_matrix = vectorizer.fit_transform(stories)

# Represent the keyword list as one query document in the same TF-IDF space.
# Words missing from the fitted vocabulary are ignored by transform(), which
# avoids looking up multi-word phrases in vocabulary_ (a lookup that yields None).
keyword_vec = vectorizer.transform([' '.join(relevant_keywords)])

# Cosine similarity between every story and the keyword query; both operands
# share the full vocabulary as feature axis, so their shapes are compatible.
similarities = cosine_similarity(story_matrix, keyword_vec).ravel()

# Loop through the stories and print those whose similarity exceeds the threshold
for i, (story, similarity) in enumerate(zip(stories, similarities)):
    if similarity > similarity_threshold:
        print(f"Story {i+1} (similarity {similarity:.3f}): {story}")


### Rake_NLTK

**Use "rake_nltk.txt" to install all the necessary packages** (```pip install -r rake_nltk.txt```).

In [None]:
from rake_nltk import Rake

# RAKE extractor configured with a minimum phrase length of one.
r = Rake(min_length=1)

# Run RAKE on the first story; the ranked (score, phrase) listing is the
# cell's displayed output.
first_story = corpus[0]
r.extract_keywords_from_text(first_story)
r.get_ranked_phrases_with_scores()

### KeyBERT & KeyphraseVectorizers

**Use "KeyBert_req.txt" to install all the necessary packages** (```pip install -r KeyBert_req.txt```).
However, before you install anything, make sure to fulfill the requirements below:
* Make sure you have installed the following: [CUDA (especially nvcc)](https://nvidia.github.io/cuda-python/install.html), [spaCy](https://spacy.io/usage#quickstart), and [Visual C++ >2017 and the Windows SDK for C++](https://visualstudio.microsoft.com/visual-cpp-build-tools/). The links above lead to the installation instructions for each of these in case the pip install of the requirements doesn't work. Visual C++ and the Windows SDK for C++ need to be installed manually. <u><span style="background-color: #f70000">**Make sure to use Python <= 3.9.**</span></u>
* <u>**I highly recommend using a virtual environment when installing all the Python libraries and dependencies.**</u>
    
      
    

In [5]:
from keyphrase_vectorizers import KeyphraseCountVectorizer
from keybert import KeyBERT

# Name of the embedding model handed to KeyBERT.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

# Initialise the keyword extraction model.
kw_model = KeyBERT(model=EMBEDDING_MODEL_NAME)

In [6]:
# Alternative kept for reference: candidate phrases from a fixed n-gram range
# keywords_1 = kw_model.extract_keywords(
#         docs=corpus, 
#         # vectorizer = KeyphraseCountVectorizer(), #passing vectorizer in, don't use keyphrase_ngram_range
#         keyphrase_ngram_range= (1,3),
#         stop_words ='english', 
#         use_maxsum=True, 
#         use_mmr=True,
#         nr_candidates = 20,
#         top_n = 20, 
#         diversity=0.5
#         )

# Candidate phrases are produced by a keyphrase vectorizer built on the small
# English spaCy pipeline; when a vectorizer is passed, keyphrase_ngram_range
# must not be used.
keyphrase_vectorizer = KeyphraseCountVectorizer(
    spacy_pipeline='en_core_web_sm',
    workers=5,
)

# NOTE(review): use_maxsum and use_mmr are both enabled; KeyBERT presumably
# applies only one diversification strategy — confirm which takes precedence.
keywords_2 = kw_model.extract_keywords(
    docs=corpus,
    vectorizer=keyphrase_vectorizer,
    use_maxsum=True,
    use_mmr=True,
    nr_candidates=20,
    top_n=10,
    diversity=0.5,
)

In [11]:
# Show the (keyphrase, score) pairs extracted for the document at index 5.
keywords_2[5]

[('community college', 0.3562),
 ('affirmative action type candidate', 0.356),
 ('government programs', 0.2948),
 ('more job counseling', 0.2814),
 ('poverty', 0.2588),
 ('accounting', 0.1879),
 ('more it classes', 0.1594),
 ('house insecure', 0.1322),
 ('midwest', 0.1315),
 ('way', 0.0915)]

In [8]:
# Show the raw text of the first story in the corpus.
corpus[0]

"My mum had me at 15 years. No idea who my dad is. I grew up with a single mum who would spend every last dollar on meth or coke. To say we were poor was an understatement. No amount of government assistance can get through to you if your mother is an addict. We moved around a lot, I went to 17 different schools growing up, having no food was a common occurrence. I've been homeless for periods of time as a kid. I've had to wash myself in public restrooms and from time to time I was sent to other 'relatives' to live. I was sexually abused on multiple occasions, and I've kept all of this to myself all these years. When you're a kid it's terrifying to speak out. You already live in a shaky, unstable world so uprooting the last foundation you have, even if it's a drug addled mother is unthinkable. Anyway, fast-forward. I tried really hard in school. I mean really hard. It was the only way I could see myself getting out of the hole I was in. My mum dropped out of school at 14 and all I knew

## Data Prep

### Clustering

In [None]:
import os
import glob
import json
# import zstandard as zstd
import pandas as pd
import numpy as np

In [None]:
# Locations of the Reddit submission dump used by the chunking step below.
# NOTE(review): these are absolute paths on one specific Windows machine; they
# must be adjusted before the notebook can run anywhere else.
# compressed_path = r"C:\Users\MoRevolution\Desktop\College\Data Dump\Reddit Sumbission Dump\reddit\submissions"
# temp_path = r"C:\Users\MoRevolution\Desktop\College\Data Dump\temp\RS_2005-06.zst"
# Line-delimited JSON file to split (read with lines=True below).
decompressed_path = r"C:\Users\MoRevolution\Desktop\College\Data Dump\Reddit Sumbission Dump\reddit\submissions\Decompressed\RS_2018-06.json"
# Directory that receives the numbered CSV chunks.
output_path = r"C:\Users\MoRevolution\Desktop\College\Data Dump\Reddit Sumbission Dump\reddit\submissions\Decompressed\chunks"
# listing = glob.glob(temp_path + '\\*.zst')

In [None]:
# Number of JSON lines written to each CSV file.
chunk_size=50000
# 1-based index used as the CSV file name (1.csv, 2.csv, ...).
batch_no=1 

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_path, exist_ok=True)

# Stream the line-delimited JSON in chunks so the whole dump never has to fit
# in memory, writing each chunk to its own CSV file.
for chunk in pd.read_json(decompressed_path,lines=True,chunksize=chunk_size):
    chunk.to_csv(os.path.join(output_path, str(batch_no)+'.csv'),index=False)
    batch_no+=1

In [None]:
### Clustering
# NOTE(review): this cell performs, in one piece, the same steps that also
# appear as separate cells in this section; running both repeats the chunking.
import os
import glob
import json
# import zstandard as zstd
import pandas as pd
import numpy as np
# compressed_path = r"C:\Users\MoRevolution\Desktop\College\Data Dump\Reddit Sumbission Dump\reddit\submissions"
# temp_path = r"C:\Users\MoRevolution\Desktop\College\Data Dump\temp\RS_2005-06.zst"
decompressed_path = r"C:\Users\MoRevolution\Desktop\College\Data Dump\Reddit Sumbission Dump\reddit\submissions\Decompressed\RS_2018-06.json"
output_path = r"C:\Users\MoRevolution\Desktop\College\Data Dump\Reddit Sumbission Dump\reddit\submissions\Decompressed\chunks"
# listing = glob.glob(temp_path + '\\*.zst')
chunk_size=50000
batch_no=1 

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(output_path, exist_ok=True)

# Split the line-delimited JSON dump into numbered CSV chunks.
for chunk in pd.read_json(decompressed_path,lines=True,chunksize=chunk_size):
    chunk.to_csv(os.path.join(output_path, str(batch_no)+'.csv'),index=False)
    batch_no+=1

# Inspect the first chunk.
path = r"C:\Users\MoRevolution\Desktop\College\Data Dump\Reddit Sumbission Dump\reddit\submissions\Decompressed\chunks\1.csv"
df = pd.read_csv(path)
# Subreddits of interest.
arr = np.array(["college", "AskReddit", "science", "psychology", "socialwork", "personalfinance", "financialaid"])
# Total number of rows that belong to any subreddit of interest.
count = (df['subreddit'].isin(arr)).sum()

# Per-subreddit row counts. reindex reports 0 for a subreddit that does not
# occur in this chunk, where label indexing with [arr] would raise KeyError.
counts = df['subreddit'].value_counts().reindex(arr, fill_value=0)
print(counts)
df.columns

In [None]:
# Subreddits of interest.
arr = np.array([
    "college",
    "AskReddit",
    "science",
    "psychology",
    "socialwork",
    "personalfinance",
    "financialaid",
])

# First CSV chunk produced by the chunking step.
path = r"C:\Users\MoRevolution\Desktop\College\Data Dump\Reddit Sumbission Dump\reddit\submissions\Decompressed\chunks\1.csv"
df = pd.read_csv(path)

# Number of rows whose subreddit is one of the subreddits of interest.
in_target_subreddit = df['subreddit'].isin(arr)
count = in_target_subreddit.sum()


In [None]:
# Per-subreddit row counts for the subreddits of interest. reindex reports 0
# for a subreddit that does not occur in the chunk, where label indexing with
# [arr] would raise KeyError.
counts = df['subreddit'].value_counts().reindex(arr, fill_value=0)
print(counts)

In [None]:
# List the columns available in the loaded chunk.
df.columns

## Filtering

### Heuristic Filtering

In [None]:

# Stories that pass the keyword heuristic.
g_story = []
# Define the keywords and phrases that are likely to appear in relevant stories
relevant_keywords = ['low socio-economic background', 'higher education', 'difficulties', 'overcome']

# Define the keywords and phrases that are likely to appear in irrelevant stories
irrelevant_keywords = ['high-achieving', 'wealthy', 'privileged']


for story in corpus: 
    # Lower-case once per story instead of once per keyword.
    story_lower = story.lower()

    # Matching is case-insensitive on both sides, so a keyword written with
    # capitals still matches.
    has_relevant = any(keyword.lower() in story_lower for keyword in relevant_keywords)
    has_irrelevant = any(keyword.lower() in story_lower for keyword in irrelevant_keywords)

    # Keep the story when it mentions a relevant keyword and no irrelevant one
    if has_relevant and not has_irrelevant:
        g_story.append(story)


### Text Classification

In [None]:
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score



In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): every item is expected to be a dict with 'text' and 'label'
# keys. The corpus displayed earlier in the notebook looks like plain strings,
# so confirm that a labeled dataset is bound to `corpus` before running this.
labeled_data = corpus

texts = [data['text'] for data in labeled_data]
labels = [data['label'] for data in labeled_data]

# Split the data into training and testing sets. Both sets must contain both
# classes: training only on 'relevant' items and testing only on the others
# yields a one-class model and a meaningless accuracy. The split is stratified
# to keep the class ratio and seeded so that it is reproducible.
train_data, test_data, train_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Convert the text data into a bag-of-words representation; the vocabulary is
# learned from the training set only and reused for the test set.
vectorizer = CountVectorizer(stop_words='english')
train_data_counts = vectorizer.fit_transform(train_data)
test_data_counts = vectorizer.transform(test_data)

In [None]:
# Train a Naive Bayes classifier on the labeled data
clf = MultinomialNB()
clf.fit(train_data_counts, train_labels)

# Evaluate the classifier on the test data
predictions = clf.predict(test_data_counts)
accuracy = accuracy_score(test_labels, predictions)
print(f"Accuracy: {accuracy}")

# Filter the relevant stories from the corpus
# NOTE(review): this rebinds the notebook-wide `corpus` name to the content of
# corpus.json; earlier cells re-run afterwards will see this data instead.
with open('corpus.json', 'r', encoding='utf-8') as f:
    corpus = json.load(f)

# Vectorize and classify all stories in one batch instead of one at a time,
# and compare each predicted label individually rather than testing a whole
# prediction array against a string.
story_texts = [story['text'] for story in corpus]
if story_texts:
    predicted_labels = clf.predict(vectorizer.transform(story_texts))
    relevant_stories = [
        story
        for story, label in zip(corpus, predicted_labels)
        if label == 'relevant'
    ]
else:
    # predict() rejects an input with zero samples; nothing to classify.
    relevant_stories = []

print(f"Found {len(relevant_stories)} relevant stories.")
