In [12]:
# Import necessary libraries
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Download the NLTK resources the preprocessing steps below rely on.
import nltk
nltk.download('punkt')  # Tokenizer models (used by older NLTK releases)
# Recent NLTK releases load the 'punkt_tab' tables in word_tokenize; without this
# resource tokenization raises LookupError unless the whole 'all' collection was fetched.
nltk.download('punkt_tab')  # Tokenizer tables
nltk.download('stopwords')  # Stopword list
nltk.download('wordnet')  # Lemmatizer resources


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [13]:
# Optional: fetch the complete NLTK data collection. This pulls every corpus, tagger
# and model in the index, so it takes much longer than the targeted downloads.
nltk.download('all')  # Downloads all NLTK data files (optional)


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [5]:
# Read the transcripts CSV into a DataFrame.
file_path = r'/content/transcripts.csv'  # Location of the input CSV
data = pd.read_csv(filepath_or_buffer=file_path)


In [6]:
# Keep only the 'transcript' column; fail fast when the dataset lacks it.
if 'transcript' in data.columns:
    data = data[['transcript']]
else:
    raise ValueError("The dataset must have a 'transcript' column.")


In [7]:
# Step 1: Handle Missing Data
def handle_missing_data(data):
    """Return a new DataFrame without the rows whose 'transcript' is missing.

    Rows where the 'transcript' column is NaN/None are discarded and the
    surviving rows are renumbered with a fresh 0..n-1 index. The DataFrame
    passed in is not modified.
    """
    has_transcript = data['transcript'].notna()
    kept_rows = data[has_transcript]
    return kept_rows.reset_index(drop=True)

# Drop the rows that have no transcript before any text processing.
data = data.pipe(handle_missing_data)

In [8]:
# Step 2: Normalize Text
# NOTE(review): no visible cell applies this function to the 'transcript'
# column — confirm whether that step was dropped.
def normalize_text(text):
    """Return ``text`` lower-cased with leading and trailing whitespace removed."""
    return text.lower().strip()

In [10]:
# Step 3: Remove Noise
def remove_noise(text):
    """Strip bracketed labels, punctuation and digits from a transcript.

    Args:
        text: Transcript string in any letter case.

    Returns:
        The lower-cased text with ``[...]`` spans removed, every character
        other than ``a-z`` and whitespace dropped, and each whitespace run
        collapsed to a single space. A leading or trailing space left over
        from removed content is not trimmed.
    """
    # The character filter below keeps only a-z, so fold case first; otherwise
    # every upper-case letter would be deleted ("Hello" -> "ello").
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)  # Remove bracketed spans such as speaker labels
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters and digits
    text = re.sub(r'\s+', ' ', text)  # Collapse whitespace runs to one space
    return text

# Clean every transcript with remove_noise.
data['transcript'] = data['transcript'].map(remove_noise)

In [14]:
# Step 4: Remove Stopwords
# Make sure the tokenizer models and the stopword list are available locally.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Holds the English stop-word set after its first use, so the list is built
# once instead of once per transcript.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, building it only once."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


# Function to remove stopwords
def remove_stopwords(text):
    """Remove English stop words from ``text``.

    Args:
        text: String to filter; it is split into tokens with ``word_tokenize``.

    Returns:
        The tokens that are not in the English stop-word set, joined by
        single spaces. Membership is an exact, case-sensitive match.
    """
    stop_words = _english_stop_words()
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

# Filter the stop words out of every transcript.
data['transcript'] = data['transcript'].map(remove_stopwords)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [15]:
# Step 5: Lemmatize Text
def lemmatize_text(text):
    """Tokenize ``text`` and return its lemmatized tokens joined by single spaces.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with that method's
    default arguments.
    """
    wnl = WordNetLemmatizer()
    lemmas = [wnl.lemmatize(token) for token in word_tokenize(text)]
    return ' '.join(lemmas)

# Lemmatize every transcript.
data['transcript'] = data['transcript'].map(lemmatize_text)


In [16]:
# Write the cleaned transcripts to CSV without the row index.
# NOTE(review): this is the same path the dataset was loaded from, so the raw
# file gets overwritten — confirm that is intended.
save_path = r'/content/transcripts.csv'  # Destination for the cleaned dataset
data.to_csv(path_or_buf=save_path, index=False)
print(f"Preprocessed dataset saved to {save_path}")


Preprocessed dataset saved to /content/transcripts.csv
