In [188]:
import pandas as pd
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import ast
import json


In [189]:
# Ensure the NLTK resources used by this notebook are installed. Each entry
# pairs a package name with a probe; the package is downloaded only when its
# probe raises LookupError.
# NOTE(review): the recorded output shows wordnet being (re)requested even
# though it is already installed, so the 'corpora/wordnet' probe appears to
# fail on an installed package — confirm the resource path.
_nltk_probes = (
    ('stopwords', lambda: stopwords.words('english')),
    ('punkt', lambda: nltk.data.find('tokenizers/punkt')),
    ('wordnet', lambda: nltk.data.find('corpora/wordnet')),
)
for _package, _probe in _nltk_probes:
    try:
        _probe()
    except LookupError:
        nltk.download(_package)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jimmyzhou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [190]:
# Load the raw organisation export (path is relative to the notebook's working directory).
df = pd.read_csv("../data/org-data-unprocessed.csv")

def parse_list_field(x):
    """Coerce a cell value into a Python list.

    Parameters
    ----------
    x : object
        A list/tuple, a missing value (None/NaN/NA), or a string holding a
        Python-literal or JSON list.

    Returns
    -------
    list
        * a list is returned unchanged; a tuple is converted to a list;
        * missing values and blank / ``"[]"`` strings give ``[]``;
        * a string that parses (``ast.literal_eval`` first, then
          ``json.loads``) to a list or tuple gives that list;
        * anything else gives a one-element list with the stripped string.
    """
    # Containers are handled before the NaN test: pd.isna() on a list returns
    # an element-wise array, and using that array in `if` raises ValueError.
    if isinstance(x, list):
        return x
    if isinstance(x, tuple):
        return list(x)
    # Only scalars can be "missing"; guarding with is_scalar keeps pd.isna()
    # from ever returning an array here.
    if pd.api.types.is_scalar(x) and pd.isna(x):
        return []

    s = str(x).strip()
    if s == "" or s == "[]":
        return []

    # Deliberately best-effort: any parse failure falls through to the next
    # strategy rather than aborting the whole column conversion.
    try:
        val = ast.literal_eval(s)
        if isinstance(val, (list, tuple)):
            return list(val)
    except Exception:
        pass
    try:
        val = json.loads(s)
        if isinstance(val, list):
            return val
    except Exception:
        pass

    # Not a recognisable list: keep the raw text as a single item.
    return [s]

# Convert the stringified list columns into real Python lists.
for _list_column in ('Events', 'News'):
    df[_list_column] = df[_list_column].apply(parse_list_field)

In [191]:
def safe_literal_eval(data):
    """Return ``data`` as a dict, parsing it from a literal string if needed.

    Parameters
    ----------
    data : object
        A dict, a string containing a Python dict literal, or anything else
        (for example NaN for an empty cell).

    Returns
    -------
    dict
        The dict itself when ``data`` is already a dict; the parsed dict when
        ``data`` is a string that evaluates to one; otherwise ``{}``. The
        result is always a dict, so callers can use ``.get`` unconditionally.
    """
    if isinstance(data, dict):
        return data
    if not isinstance(data, str):
        return {}

    text = data.strip()
    if not text.startswith('{'):
        return {}
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Every failure mode documented for literal_eval maps to "no data".
        return {}
    # A leading '{' can also introduce a set literal; only accept real dicts.
    return parsed if isinstance(parsed, dict) else {}

# Parse the stringified dict column; safe_literal_eval substitutes {} for values it cannot use.
df['Additional Information'] = df['Additional Information'].apply(safe_literal_eval)

In [192]:
# Lazily-filled cache so the stop-word corpus is loaded once, not once per call.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it on first use only."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def clean_data(text):
    """Normalise free text into a space-separated string of content words.

    Steps, in order: lower-case; delete ``http(s)://`` and ``www.`` URLs;
    delete every character that is not ``a-z`` or whitespace (digits and
    punctuation are removed without inserting a space, so e.g. "what's"
    becomes "whats"); tokenise with ``word_tokenize``; drop English stop
    words; join the remaining tokens with single spaces.

    Parameters
    ----------
    text : object
        Any scalar; it is converted with ``str``. ``None`` and NaN-like
        values are treated as empty.

    Returns
    -------
    str
        The cleaned text, or ``""`` for missing input.
    """
    if text is None or pd.isna(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    stop_words = _english_stop_words()
    return " ".join(
        word for word in word_tokenize(text) if word not in stop_words
    )

In [193]:

# Replace each description with its cleaned form (overwrites the raw text in df).
df['Description'] = df['Description'].apply(clean_data)

In [194]:
def clean_event_list(event_list):
    """Clean every distinct event in ``event_list``.

    Non-list input yields ``[]``. ``None``/NaN entries are dropped, repeated
    raw entries are collapsed to their first occurrence (original order is
    kept), and each surviving entry is passed through ``clean_data``.
    """
    if not isinstance(event_list, list):
        return []
    # dict keys remember first-insertion order, so this removes repeats
    # without reordering the events
    distinct = dict.fromkeys(
        entry for entry in event_list
        if not (entry is None or pd.isna(entry))
    )
    return [clean_data(entry) for entry in distinct]

# De-duplicate and clean each organisation's event list (overwrites the column).
df['Events'] = df['Events'].apply(clean_event_list)

In [195]:
# Clean the Document text with the same normalisation used for Description.
df['Document'] = df['Document'].apply(clean_data)

In [196]:
def clean_news(news):
    """Clean every distinct article in ``news``.

    Returns ``[]`` for non-list input. ``None``/NaN items are skipped, an
    item equal to one already seen is skipped, and the remaining items are
    run through ``clean_data`` in their original order.
    """
    if not isinstance(news, list):
        return []
    cleaned_articles = []
    already_seen = set()
    for article in news:
        # same checks, same order: missing first, then duplicate
        if article is None or pd.isna(article) or article in already_seen:
            continue
        already_seen.add(article)
        cleaned_articles.append(clean_data(article))
    return cleaned_articles

# De-duplicate and clean each organisation's news list (overwrites the column).
df['News'] = df['News'].apply(clean_news)

In [197]:
# Spot-check: text columns are cleaned here; 'Additional Information' is cleaned in a later cell.
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Description,Additional Information,Events,Document,News,URL
0,0,100% Triumph,triumph student organization desires improve s...,{'mission_statement': 'The purpose of this org...,[],constitution bylaws triumph last updated artic...,[hello hope great week tomorrow triumph hostin...,https://terplink.umd.edu/organization/salvatio...
1,1,1000 Schools,check much fun students helping honduras shh i...,{'mission_statement': 'To alleviate extreme po...,[come general body meetings learn whats going ...,schools constitution members organization orde...,[],https://terplink.umd.edu/organization/1000schools
2,2,123 I Like to Ski :),join groupme linktree,{'mission_statement': 'The purpose of Ski Club...,[like ski managing trip mont tremblant go skii...,like ski ski club constitution article name se...,[],https://terplink.umd.edu/organization/skiclub
3,3,17 for Peace and Justice,organization seek empower students stakeholder...,{'mission_statement': '17 for Peace and Justic...,[],preamble members peace justice hereby establis...,[],https://terplink.umd.edu/organization/17-for-p...
4,4,32 Bars,bars musical performance troupe dedicated perf...,{'mission_statement': 'The mission of 32 Bars ...,[],constitution last updates article name section...,[],https://terplink.umd.edu/organization/32-bars


In [198]:
def clean_additional_info(info: dict):
    """Clean the free-text fields of an "Additional Information" dict.

    The four text fields below are replaced by ``clean_data(value)`` when
    present. Keys that are absent are left absent instead of raising
    ``KeyError``, so an empty dict (what ``safe_literal_eval`` returns for
    unusable cells) passes through untouched. All other keys are not
    modified.

    Parameters
    ----------
    info : dict
        Mapping to clean. It is modified in place.

    Returns
    -------
    dict
        The same ``info`` object, or ``{}`` if ``info`` is not a dict.
    """
    if not isinstance(info, dict):
        return {}
    text_fields = (
        'mission_statement',
        'membership_requirements',
        'how_to_get_involved',
        'general_meeting_information',
    )
    for field in text_fields:
        if field in info:
            info[field] = clean_data(info[field])
    return info

In [199]:
# Clean the text fields inside each info dict; clean_additional_info mutates the dicts in place.
df['Additional Information'] = df['Additional Information'].apply(clean_additional_info)

In [200]:
# Spot-check: the dict column's text fields should now be cleaned as well.
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Description,Additional Information,Events,Document,News,URL
0,0,100% Triumph,triumph student organization desires improve s...,{'mission_statement': 'purpose organization sh...,[],constitution bylaws triumph last updated artic...,[hello hope great week tomorrow triumph hostin...,https://terplink.umd.edu/organization/salvatio...
1,1,1000 Schools,check much fun students helping honduras shh i...,{'mission_statement': 'alleviate extreme pover...,[come general body meetings learn whats going ...,schools constitution members organization orde...,[],https://terplink.umd.edu/organization/1000schools
2,2,123 I Like to Ski :),join groupme linktree,{'mission_statement': 'purpose ski club bring ...,[like ski managing trip mont tremblant go skii...,like ski ski club constitution article name se...,[],https://terplink.umd.edu/organization/skiclub
3,3,17 for Peace and Justice,organization seek empower students stakeholder...,{'mission_statement': 'peace justice exists le...,[],preamble members peace justice hereby establis...,[],https://terplink.umd.edu/organization/17-for-p...
4,4,32 Bars,bars musical performance troupe dedicated perf...,{'mission_statement': 'mission bars produce st...,[],constitution last updates article name section...,[],https://terplink.umd.edu/organization/32-bars


In [201]:
def _build_org_record(row):
    """Flatten one organisation row into ``[Name, URL, Time, Information]``.

    ``Information`` is the name, description, every event, the document,
    every news item and (when the info column is a dict) the general meeting
    information, each followed by a single space. ``Time`` is the meeting
    schedule immediately followed by the expected time commitment and a
    trailing space, or ``""``.
    """
    text_parts = [str(row['Name']), str(row['Description'])]
    text_parts.extend(str(event) for event in row['Events'])
    text_parts.append(str(row['Document']))
    text_parts.extend(str(article) for article in row['News'])

    add_info = row['Additional Information']
    time_text = ""
    if isinstance(add_info, dict):
        schedule = add_info.get('meeting_schedule')
        commitment = add_info.get('expected_time_commitment')
        # NOTE(review): Time is filled only when BOTH fields are present, and
        # the two values are joined with no separator (the output reads
        # "Evening (6pm-9pm)Low time commitment ..."). Kept as-is so the
        # exported file does not change — confirm whether that is intended.
        if schedule is not None and commitment is not None:
            time_text = (schedule or "") + (commitment or "") + ' '
        # NOTE(review): of the cleaned info fields only this one is added to
        # Information — confirm the others are meant to be left out.
        text_parts.append(str(add_info.get('general_meeting_information') or ""))

    # Every fragment, including the last, is terminated by one space.
    information = "".join(part + ' ' for part in text_parts)
    return [row['Name'], row['URL'], time_text, information]


# Build all records first and construct the frame once; appending with
# .loc[len(frame)] re-allocates the frame on every row.
cleaned_df = pd.DataFrame(
    [_build_org_record(row) for _, row in df.iterrows()],
    columns=['Name', 'URL', 'Time', 'Information'],
)


In [203]:
# Show the assembled frame (the notebook display elides the middle rows).
cleaned_df

Unnamed: 0,Name,URL,Time,Information
0,100% Triumph,https://terplink.umd.edu/organization/salvatio...,Evening (6pm-9pm)Low time commitment (1-5 hour...,100% Triumph triumph student organization desi...
1,1000 Schools,https://terplink.umd.edu/organization/1000schools,Morning (9am-12pm)\nEarly Afternoon (12pm-3pm)...,1000 Schools check much fun students helping h...
2,123 I Like to Ski :),https://terplink.umd.edu/organization/skiclub,Weekends (Saturday and Sunday)Low time commitm...,123 I Like to Ski :) join groupme linktree lik...
3,17 for Peace and Justice,https://terplink.umd.edu/organization/17-for-p...,Evening (6pm-9pm)\nWeekdays (Monday-Friday)\nW...,17 for Peace and Justice organization seek emp...
4,32 Bars,https://terplink.umd.edu/organization/32-bars,Late Afternoon (3pm-6pm)\nEvening (6pm-9pm)\nL...,32 Bars bars musical performance troupe dedica...
...,...,...,...,...
960,Youth in Public Health,https://terplink.umd.edu/organization/youthinp...,Late Afternoon (3pm-6pm)\nEvening (6pm-9pm)\nW...,Youth in Public Health youth public health dyn...
961,Zeta Beta Tau - Beta Zeta Epsilon Fraternity,https://terplink.umd.edu/organization/zeta-bet...,Late Afternoon (3pm-6pm)\nEvening (6pm-9pm)\nW...,Zeta Beta Tau - Beta Zeta Epsilon Fraternity f...
962,"Zeta Phi Beta Sorority, Incorporated - Eta Eps...",https://terplink.umd.edu/organization/na,Early Afternoon (12pm-3pm)\nLate Afternoon (3p...,"Zeta Phi Beta Sorority, Incorporated - Eta Eps..."
963,Zeta Psi Fraternity,https://terplink.umd.edu/organization/zeta-psi...,Evening (6pm-9pm)\nLate Night (After 9pm)\nWee...,Zeta Psi Fraternity social fraternity working ...


In [204]:
# Persist the flattened table next to the raw export.
# NOTE(review): to_csv writes the index as an extra unnamed first column by
# default; the raw input already carries 'Unnamed: 0' / 'Unnamed: 0.1'
# columns, which looks like the same round-trip artefact. Consider
# index=False once downstream readers are confirmed not to rely on it.
cleaned_df.to_csv('../data/org-data-cleaned.csv')