In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import download

# Fetch the NLTK data used below, in order: the WordNet lemmatizer data,
# the Open Multilingual WordNet, and the stopword lists.
for resource in ('wordnet', 'omw-1.4', 'stopwords'):
    download(resource)

# Shared helpers read by preprocess_text: one lemmatizer instance and the
# English stopwords held in a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}


[nltk_data] Downloading package wordnet to /home/jmaharja/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jmaharja/nltk_data...
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jmaharja/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Location of the processed input CSV -- replace with your own file path.
file_path = '/data/jmharja/projects/PersonaClassifier/data/pandora_processed_data_nov_12.csv'

# Read the CSV and, in the same chain, keep only the columns whose name does
# not start with "Unnamed:".
data = (
    pd.read_csv(file_path)
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed:')]
)
data.head(1)

Data successfully split and saved!


In [12]:
def preprocess_text(text):
    """Normalize a raw status string into space-joined, lemmatized tokens.

    Steps, in order: lowercase; strip URLs, @mentions and #hashtags; drop
    every character that is not a-z or whitespace; collapse whitespace;
    split on whitespace; drop tokens found in the module-level
    ``stop_words``; lemmatize the remaining tokens with the module-level
    ``lemmatizer``.

    Args:
        text: The raw text. Any non-string value (for example ``None`` or a
            float NaN standing in for a missing cell) is treated as empty
            text instead of raising.

    Returns:
        str: The cleaned text, or ``""`` when nothing survives cleaning.
    """
    # Guard: missing cells reach this function as None/NaN, which have no
    # .lower(); returning "" keeps a column-wide apply from aborting.
    if not isinstance(text, str):
        return ""

    # 1. Convert to lowercase
    text = text.lower()

    # 2. Remove URLs ("http\S+" already covers https links)
    text = re.sub(r'http\S+|www\S+', '', text)

    # 3. Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)

    # 4. Remove hashtags entirely (to keep the word, strip only the '#':
    #    text = re.sub(r'#', '', text))
    text = re.sub(r'#\w+', '', text)

    # 5. Remove special characters, numbers, and punctuation
    #    (to keep numbers, use r'[^a-z0-9\s]')
    text = re.sub(r'[^a-z\s]', '', text)

    # 6. Collapse runs of whitespace and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()

    # 7. Tokenize, drop stopwords, and lemmatize what is left.
    # NOTE(review): punctuation is stripped in step 5, before this filter, so
    # a contraction such as "that's" is compared as "thats" -- confirm that
    # this is the intended stopword behaviour.
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    return " ".join(tokens)

# Clean every STATUS entry element-wise and store the result in a new column
data['cleaned_status'] = data['STATUS'].map(preprocess_text)

data.head(2)


Unnamed: 0,STATUS,cAGR,cOPN,cCON,cEXT,cNEU,Segment,WC,Analytic,Clout,...,disgust,fear,joy,negative,positive,sadness,surprise,trust,sent_score,cleaned_status
0,Those stats come from the test. [Echoing the c...,0,1,0,0,1,1,152,72.8,40.06,...,0,1,2,2,12,0,2,6,0.3919,stats come test echoing comment made related q...
1,"That's great to hear! I hope you know that, de...",0,1,0,0,1,1,321,24.04,73.9,...,3,6,7,14,22,11,6,9,0.7646,thats great hear hope know despite harsh word ...


In [14]:
# Hold out 10% of the rows as a test set; the fixed seed keeps the split
# reproducible. No separate validation split is written.
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)

# Write each split to its CSV without the index column (train first, then test).
for split_frame, destination in (
    (train_data, '/data/jmharja/projects/PersonaClassifier/data/pandora_processed_train_nov26.csv'),
    (test_data, '/data/jmharja/projects/PersonaClassifier/data/pandora_processed_test_nov26.csv'),
):
    split_frame.to_csv(destination, index=False)

print("Data successfully split and saved!")

Data successfully split and saved!
