In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
# One-time download of the NLTK resources used below: the Punkt tokenizer
# data (word_tokenize), the English stop-word list, and the WordNet corpus
# (WordNetLemmatizer).
# 'punkt_tab' is the Punkt data format that newer NLTK releases load in place
# of the pickled 'punkt' models; without it word_tokenize raises LookupError
# on those releases. Both are requested so the notebook runs on either side
# of that change. Packages that are already up to date are not re-fetched.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /Users/meshach/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/meshach/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /Users/meshach/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Load the cleaned dataset; the file's bytes are decoded as Latin-1.
df = pd.read_csv(
    filepath_or_buffer='./finalData/final_data_cleaned.csv',
    encoding='latin1',
)

In [6]:
# Discard every row whose title, body or tags are missing.
# df itself is modified (in-place drop), so this cell changes the loaded frame.
required_columns = ['Title', 'Body', 'Tags']
df.dropna(subset=required_columns, inplace=True)

In [8]:
# Build a single 'Text' column: title, one space, then body.
# Missing values in either column are treated as empty strings.
title_part = df['Title'].fillna('')
body_part = df['Body'].fillna('')
df['Text'] = title_part + ' ' + body_part

In [11]:
# Shared NLP resources used by preprocess():
# a WordNet lemmatizer and the English stop-word list held as a set
# so membership checks are constant-time.
lemmatizer = WordNetLemmatizer()
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

# Modified preprocess function without removing code-like terms
def preprocess(text):
    """Tokenize ``text``, drop English stop words and lemmatize the rest.

    Stop words are matched case-insensitively against the module-level
    ``stop_words`` set. Surviving tokens are passed to the module-level
    ``lemmatizer`` with their original casing, and no other filtering
    (punctuation, digits, code-like tokens) is applied.

    Returns the processed tokens joined by single spaces.
    """
    # Lazily filter the token stream, then lemmatize each kept token in order.
    kept = (tok for tok in word_tokenize(text) if tok.lower() not in stop_words)
    return ' '.join(lemmatizer.lemmatize(tok) for tok in kept)

# Run preprocess() on every combined title+body string and store the result
# in a new 'clean_text' column.
df['clean_text'] = df['Text'].map(preprocess)

In [12]:
# Write only the preprocessed text and its tags to disk, omitting the index.
df.to_csv('final_preprocessed.csv', columns=['clean_text', 'Tags'], index=False)