# 1. Load data and necessary libraries

In [8]:
import pandas as pd
import re
import nltk
import time

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from textstat import flesch_kincaid_grade

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ismathakit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ismathakit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ismathakit/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/ismathakit/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [39]:
train_df = pd.read_csv("/Users/ismathakit/Downloads/trainingandtestdata/train.csv", encoding_errors='ignore')
test_df = pd.read_csv("/Users/ismathakit/Downloads/trainingandtestdata/test.csv", encoding_errors='ignore')

**Explore dataset separately to test and train.**

*Train dataset*

In [40]:
train_df = train_df.iloc[:,2:]
train_df.columns = ['Date', 'Type', 'Username', 'Text']
train_df.head()

Unnamed: 0,Date,Type,Username,Text
0,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [41]:
train_df.shape

(1599999, 4)

*Test dataset*

The first two columns doesn't tell any significances for the information. So, I'd like to drop those of according to the train header.

In [44]:
# New test data
test_df = test_df.iloc[:,2:]
test_df.columns = ['Date', 'Type', 'Username', 'Text']
test_df.head()

Unnamed: 0,Date,Type,Username,Text
0,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...
1,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck..."
2,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...
3,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...
4,Mon May 11 03:22:00 UTC 2009,kindle2,GeorgeVHulme,@richardebaker no. it is too big. I'm quite ha...


In [45]:
test_df.shape

(497, 4)

# 2. Data Preprocessing
Remove URLs, mentions, hashtags, numbers, and special characters using regex.

In [31]:
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [54]:
def clean_text(text):
    """Clean text by removing URLs, mentions, hashtags, numbers, and special characters."""
    text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r"@\w+|#\w+", "", text)  # Remove mentions and hashtags
    text = re.sub(r"\d+", "", text)  # Remove numbers
    text = re.sub(r"[^\w\s]", "", text)  # Remove special characters
    text = text.strip()  # Strip leading/trailing whitespaces
    return text

def tokenize_text(text):
    """Tokenize text into words."""
    return word_tokenize(text)

def remove_stopwords(tokens):
    """Remove stopwords from tokenized text."""
    return [word for word in tokens if word.lower() not in stop_words]

def stem_text(tokens):
    """Apply stemming to tokens."""
    return [stemmer.stem(word) for word in tokens]

def lemmatize_text(tokens):
    """Apply lemmatization to tokens."""
    return [lemmatizer.lemmatize(word) for word in tokens]
'''
def preprocess_data(df, text_column):
    """Preprocess the text data in the specified column of a dataframe."""
    df["Cleaned_Text"] = df[text_column].apply(clean_text)
    df["Tokens"] = df["Cleaned_Text"].apply(tokenize_text)
    df["Tokens"] = df["Tokens"].apply(remove_stopwords)
    df["Stemmed_Tokens"] = df["Tokens"].apply(stem_text)
    df["Lemmatized_Tokens"] = df["Tokens"].apply(lemmatize_text)
    df["Processed_Text"] = df["Lemmatized_Tokens"].apply(lambda x: " ".join(x))
'''
def preprocess_data(df, text_column):
    """Preprocess the text data and add a 'Processed_Text' column to the dataframe."""
    import re
    from nltk.corpus import stopwords
    from nltk.stem import PorterStemmer
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize import word_tokenize
    
    # Ensure stopwords are available
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    def clean_text(text):
        """Clean individual text input."""
        # Remove URLs
        text = re.sub(r'http\S+', '', text)
        # Remove special characters and numbers
        text = re.sub(r'[^A-Za-z\s]', '', text)
        # Convert to lowercase
        text = text.lower()
        # Tokenize
        tokens = word_tokenize(text)
        # Remove stopwords
        tokens = [word for word in tokens if word not in stop_words]
        # Stemming and Lemmatization
        tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens]
        # Rejoin tokens
        return ' '.join(tokens)
    
    # Apply cleaning function to the text column
    df['Processed_Text'] = df[text_column].apply(clean_text)
    return df

# 3. Pre-process text

In [55]:
start_time = time.time()

train_df = preprocess_data(train_df, text_column='Text')  # Replace 'Text' with the correct column name in train.csv
test_df = preprocess_data(test_df, text_column='Text')  # Replace 'Text' with the correct column name in test.csv

end_time = time.time()

In [59]:
# Check if 'Processed_Text' was added successfully
print(train_df[['Text', 'Processed_Text']].head())
print(test_df[['Text', 'Processed_Text']].head())

                                                Text  \
0  is upset that he can't update his Facebook by ...   
1  @Kenichan I dived many times for the ball. Man...   
2    my whole body feels itchy and like its on fire    
3  @nationwideclass no, it's not behaving at all....   
4                      @Kwesidei not the whole crew    

                                      Processed_Text  
0  upset cant updat facebook text might cri resul...  
1  kenichan dive mani time ball manag save rest g...  
2                    whole bodi feel itchi like fire  
3              nationwideclass behav im mad cant see  
4                                kwesidei whole crew  
                                                Text  \
0  Reading my kindle2...  Love it... Lee childs i...   
1  Ok, first assesment of the #kindle2 ...it fuck...   
2  @kenburbary You'll love your Kindle2. I've had...   
3  @mikefish  Fair enough. But i have the Kindle2...   
4  @richardebaker no. it is too big. I'm quite ha... 

Yes, it worked :)

# 4. Compare basic statistics before and after cleaning

In [60]:
def calculate_statistics(text_series):
    """Calculate basic text statistics: average sentence length, word count, vocabulary size."""
    total_words = sum(text_series.apply(lambda x: len(x.split())))
    unique_words = len(set(" ".join(text_series).split()))
    average_sentence_length = total_words / len(text_series)
    return total_words, unique_words, average_sentence_length

In [61]:
# Before cleaning (Train and Test)
original_train_stats = calculate_statistics(train_df["Text"])
original_test_stats = calculate_statistics(test_df["Text"])

# After cleaning (Train and Test)
cleaned_train_stats = calculate_statistics(train_df["Processed_Text"])
cleaned_test_stats = calculate_statistics(test_df["Processed_Text"])

# 5. Evaluate text quality

In [62]:
def calculate_lexical_diversity(text_series):
    """Calculate lexical diversity (ratio of unique words to total words)."""
    total_words = sum(text_series.apply(lambda x: len(x.split())))
    unique_words = len(set(" ".join(text_series).split()))
    return unique_words / total_words

In [63]:
train_df["Readability_Score"] = train_df["Processed_Text"].apply(flesch_kincaid_grade)
test_df["Readability_Score"] = test_df["Processed_Text"].apply(flesch_kincaid_grade)

train_lexical_diversity = calculate_lexical_diversity(train_df["Processed_Text"])
test_lexical_diversity = calculate_lexical_diversity(test_df["Processed_Text"])

# 6. Output results

In [70]:
runtime = end_time - start_time

print("--- Train Dataset Cleaning Statistics ---")
print(f"- Number of documents: {len(train_df['Text'])} → {len(train_df['Processed_Text'])}")
print(f"- Average tokens per document: {original_train_stats[2]} → {cleaned_train_stats[2]}")
print(f"- Total vocabulary size: {original_train_stats[1]} → {cleaned_train_stats[1]}")
print(f"- Lexical diversity: {train_lexical_diversity:.2f}")
print(f"- Average readability score (Flesch-Kincaid): {train_df['Readability_Score'].mean():.2f}")

--- Train Dataset Cleaning Statistics ---
- Number of documents: 1599999 → 1599999
- Average tokens per document: 13.175500109687569 → 7.670459169036981
- Total vocabulary size: 1350157 → 672984
- Lexical diversity: 0.05
- Average readability score (Flesch-Kincaid): 4.30


In [69]:
print("\n--- Test Dataset Cleaning Statistics ---")
print(f"- Number of documents: {len(test_df['Text'])} → {len(test_df['Processed_Text'])}")
print(f"- Average tokens per document: {original_test_stats[2]} → {cleaned_test_stats[2]}")
print(f"- Total vocabulary size: {original_test_stats[1]} → {cleaned_test_stats[1]}")
print(f"- Lexical diversity: {test_lexical_diversity:.2f}")
print(f"- Average readability score (Flesch-Kincaid): {test_df['Readability_Score'].mean():.2f}")


--- Test Dataset Cleaning Statistics ---
- Number of documents: 497 → 497
- Average tokens per document: 13.541247484909457 → 8.227364185110664
- Total vocabulary size: 3150 → 1776
- Lexical diversity: 0.43
- Average readability score (Flesch-Kincaid): 4.75


In [68]:
print(f"\n--- Total Runtime ---")
print(f"- Total runtime: {runtime:.2f} seconds")


--- Total Runtime ---
- Total runtime: 169.54 seconds
