In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import download

# Download resources for lemmatization
download('wordnet')      # WordNet lemmatizer
download('omw-1.4')      # Open Multilingual WordNet
download('stopwords')    # Stopwords

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package wordnet to /home/jmaharja/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jmaharja/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jmaharja/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
def preprocess(df):
    def preprocess_text(text):
        # 1. Convert to lowercase
        text = text.lower()
        
        # 2. Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        
        # 3. Remove mentions (@username)
        text = re.sub(r'@\w+', '', text)
        
        # 4. Remove hashtags (keep the word or remove completely)
        text = re.sub(r'#\w+', '', text)  # To keep the word, use: text = re.sub(r'#', '', text)
        
        # 5. Remove special characters, numbers, and punctuation
        # text = re.sub(r'[^a-z\s]', '', text)  # To keep numbers, modify to: r'[^a-z0-9\s]'
        text = re.sub(r'[^a-z0-9\s]', '', text)  # To keep numbers, modify to: r'[^a-z0-9\s]'

        
        # 6. Remove extra whitespace
        text = re.sub(r'\s+', ' ', text).strip()
            
        # 8. Tokenize words and lemmatize
        tokens = text.split()
        # tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

        text = " ".join(tokens)
        
        return text

    df = df.loc[:, ~df.columns.str.contains('^Unnamed:')]
    print(df.shape)
    df['cleaned_status'] = df['STATUS'].apply(preprocess_text)
    df.rename(columns={'STATUS': 'original', 'cleaned_status': 'STATUS'}, inplace=True)
    # Convert to strings and drop NaN values
    df['STATUS'] = df['STATUS'].astype(str).fillna('')
    df = df[df['STATUS'].str.strip() != '']
    print(df.shape)
    filtered_df = df[df['STATUS'].str.split().str.len() >= 3]
    print(filtered_df.shape)
    return filtered_df

In [22]:
df1 = pd.read_csv('data/mypersonality_processed_data_nov_27.csv')
df2 = pd.read_csv('data/pandora_processed_train.csv')
df1 = preprocess(df1)
df2 = preprocess(df2)
merged_df = pd.concat([df1, df2], ignore_index=True)
merged_df.to_csv("data/all_cleaned_train_data_nov_27.csv")
merged_df.shape

(9917, 139)
(9891, 140)
(9339, 140)
(46636, 139)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_status'] = df['STATUS'].apply(preprocess_text)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns={'STATUS': 'original', 'cleaned_status': 'STATUS'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['STATUS'] = df['STATUS'].astype(str).fillna('')


(46634, 140)
(45491, 140)


(54830, 140)

In [21]:
merged_df.sample(10)

Unnamed: 0,original,cEXT,cNEU,cAGR,cCON,cOPN,Segment,WC,Analytic,Clout,...,disgust,fear,joy,negative,positive,sadness,surprise,trust,sent_score,STATUS
46917,"We throw tortillas on the field, hate everyone...",1,0,0,0,1,1,59,43.72,50.15,...,2,6,2,4,4,3,2,2,0.5912,we throw tortilla on the field hate everyone h...
1235,"~ hahaha, no school.",0,1,0,1,1,1,3,10.19,1.0,...,0,0,0,0,0,0,0,1,0.34,hahaha no school
23194,"Fastest? Well, get a spoon and just start eating.",0,0,0,1,1,1,9,33.38,,...,0,0,0,0,0,0,0,0,0.2732,fastest well get a spoon and just start eating
35342,Is that a song? But yes I absolutely do. Jus...,1,0,0,0,1,1,22,6.68,1.08,...,0,2,0,1,0,1,0,0,0.1901,is that a song but yes i absolutely do just do...
50794,That is a falcon. Hawks don't hunt like that.,0,1,1,0,1,1,9,10.19,2.75,...,0,0,0,0,0,0,0,0,-0.2755,that is a falcon hawk dont hunt like that
4019,wishes everyone a Happy New Year!,0,0,1,1,0,1,6,89.52,98.75,...,0,0,1,0,1,0,0,1,0.68,wish everyone a happy new year
41010,"Nah, I'd like to spend some more time with the...",0,1,0,0,0,1,17,98.34,2.18,...,0,0,0,0,0,0,0,1,0.2732,nah id like to spend some more time with the s...
13868,any spot that you can get to without cheats sh...,0,1,0,0,0,1,49,8.47,28.8,...,0,0,1,1,1,1,0,2,0.8142,any spot that you can get to without cheat sho...
29568,So they're bribing the sports agencies? Plaus...,1,0,0,0,1,1,7,2.35,97.11,...,0,0,0,0,0,0,0,0,0.0,so theyre bribing the sport agency plausible
28119,"Depends through what medium, the topic, if the...",0,0,0,1,1,1,43,98.95,27.38,...,0,0,0,0,2,0,0,1,0.25,depends through what medium the topic if the p...


In [14]:
# train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)
# train_data.to_csv('/data/jmharja/projects/PersonaClassifier/data/pandora_processed_train_nov26.csv', index=False)
# # val_data.to_csv('val_data_nov26.csv', index=False)
# test_data.to_csv('/data/jmharja/projects/PersonaClassifier/data/pandora_processed_test_nov26.csv', index=False)

# print("Data successfully split and saved!")

Data successfully split and saved!
