In [None]:
import pandas as pd
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from google.colab import files

# Fetch the NLTK resources used further down: the tokenizer data
# ('punkt' / 'punkt_tab') for word_tokenize and the 'stopwords' corpus for
# stopwords.words('english'). These calls run on every execution of the cell.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


def load_dataset():
    """Prompt for a file upload in Colab and load it as a DataFrame.

    Opens the ``google.colab.files.upload()`` dialog and reads the first
    uploaded file with ``pandas.read_csv``. Any further files in the same
    upload are ignored.

    Returns:
        The parsed ``pandas.DataFrame``.

    Raises:
        ValueError: If the upload dialog returned no files.
    """
    print("Please upload your dataset.")
    uploaded = files.upload()
    # Previously an empty result surfaced as an opaque IndexError from
    # ``list(...)[0]``; fail with an explicit message instead.
    if not uploaded:
        raise ValueError("No file was uploaded; cannot load a dataset.")
    # Only the first key is used, matching the original single-file behaviour.
    file_name = next(iter(uploaded))
    data = pd.read_csv(file_name)
    print(f"Dataset Loaded. Shape: {data.shape}")
    return data

def clean_data(data, text_column):
    """Drop duplicate rows, then rows with no value in ``text_column``.

    Args:
        data: Input ``pandas.DataFrame``; it is not modified.
        text_column: Name of the column that must be non-null.

    Returns:
        A new DataFrame holding the surviving rows. The original index
        labels are kept (the index is not reset).
    """
    # Duplicates are judged on all columns; missing-value filtering looks
    # only at the text column.
    return data.drop_duplicates().dropna(subset=[text_column])

def preprocess_text(data, text_column):
    """Normalise, tokenise, stop-word-filter and stem one text column.

    Each non-missing value is lower-cased, stripped of ASCII punctuation and
    of digit characters, split with ``word_tokenize``, filtered against the
    English stop-word list, stemmed with ``PorterStemmer`` and re-joined
    with single spaces.

    Args:
        data: Input ``pandas.DataFrame``. It is left untouched; the work is
            done on a copy so that a frame obtained from filtering
            (e.g. ``dropna``) is not written into.
        text_column: Name of the column to process.

    Returns:
        A copy of ``data`` whose ``text_column`` holds the processed strings.
        Missing values in that column are passed through unchanged.
    """
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    # Build the punctuation-removal table once instead of once per row.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def normalize(text):
        text = text.lower().translate(strip_punctuation)
        # str.isdigit also matches non-ASCII digits, so keep the
        # per-character filter rather than folding it into the table.
        return ''.join(char for char in text if not char.isdigit())

    def tokenize_and_process(text):
        return ' '.join(
            stemmer.stem(word)
            for word in word_tokenize(text)
            if word not in stop_words
        )

    def process(value):
        # str() guards against non-string cells (e.g. numbers), which would
        # otherwise fail on .lower().
        return tokenize_and_process(normalize(str(value)))

    processed = data.copy()
    processed[text_column] = processed[text_column].map(
        process, na_action='ignore'
    )
    return processed

def vectorize_text(data, text_column):
    """Fit a default ``TfidfVectorizer`` on one column and transform it.

    Args:
        data: DataFrame containing the text to vectorise.
        text_column: Name of the column passed to ``fit_transform``.

    Returns:
        A ``(vectors, vectorizer)`` tuple: the result of ``fit_transform``
        and the fitted vectorizer instance.
    """
    tfidf = TfidfVectorizer()
    return tfidf.fit_transform(data[text_column]), tfidf

def save_preprocessed_data(data, output_path):
    """Write ``data`` to CSV and hand the file to the Colab download helper.

    Args:
        data: DataFrame to persist; the index is not written.
        output_path: Destination path of the CSV file, also passed to
            ``files.download``.
    """
    data.to_csv(path_or_buf=output_path, index=False)
    files.download(output_path)
    confirmation = f"Preprocessed data saved and downloaded as {output_path}"
    print(confirmation)

def main():
    """Run the pipeline: load, clean, preprocess, vectorise, save.

    The text column name and the output file name are fixed here.
    """
    text_column = 'text_'
    output_path = 'preprocessed_fake_review_data.csv'

    raw = load_dataset()
    cleaned = clean_data(raw, text_column)
    processed = preprocess_text(cleaned, text_column)
    # The TF-IDF matrix and fitted vectorizer are computed but not used
    # afterwards; only the processed text is saved.
    vectorize_text(processed, text_column)
    save_preprocessed_data(processed, output_path)

# Run the pipeline only when this module is executed as the main program
# (which includes running it as a notebook cell), not when it is imported.
if __name__ == "__main__":
    main()


Please upload your dataset.


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Saving fakeReviewData.csv to fakeReviewData (2).csv
Dataset Loaded. Shape: (40432, 4)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Preprocessed data saved and downloaded as preprocessed_fake_review_data.csv
