In [None]:
import os

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Download necessary NLTK data.
# 'stopwords' and 'wordnet' back the stop-word set and the lemmatizer created
# below. nltk.word_tokenize (used by preprocess_text) additionally needs the
# Punkt tokenizer models; without them it raises LookupError on a fresh
# runtime. Newer NLTK releases load these models from the 'punkt_tab' package
# rather than 'punkt', so both are requested.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/Project-Name/text_classification/data/raw/text_dataset.csv')

# Check the first few rows
print(data.head())

# Preprocess the text data
# A set gives O(1) membership tests when filtering tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Return a cleaned, space-joined version of *text*.

    The text is tokenized with ``nltk.word_tokenize``; tokens that are not
    purely alphabetic are dropped, the rest are lowercased, filtered against
    the module-level ``stop_words`` set, and lemmatized with the module-level
    ``lemmatizer``.

    Args:
        text: The raw document. Non-string values (for example the float NaN
            that pandas uses for an empty CSV cell) are treated as an empty
            document.

    Returns:
        A single string of the surviving lemmas separated by spaces; the
        empty string when nothing survives or *text* is not a string.
    """
    # Missing cells arrive as float NaN, which the tokenizer cannot process.
    if not isinstance(text, str):
        return ''

    lemmas = []
    for token in nltk.word_tokenize(text):
        if not token.isalpha():
            continue
        # Lowercase after tokenizing so the tokenizer still sees the original
        # casing, but before the stop-word test: the stop-word list is
        # lowercase, so "The" would otherwise slip through the filter.
        word = token.lower()
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmas)

# Replace each raw document with its cleaned form.
data['text'] = data['text'].apply(preprocess_text)

# Split the dataset into training and testing sets (80/20, fixed seed so the
# split is reproducible between runs).
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

# Convert text data to TF-IDF features.
# The vectorizer is fitted on the training split only and then applied to the
# test split, so no test-set vocabulary or statistics leak into the features.
vectorizer = TfidfVectorizer(max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Save the processed data
processed_data_path = '/content/drive/MyDrive/Project-Name/text_classification/data/processed/'
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing anything into it.
os.makedirs(processed_data_path, exist_ok=True)

# NOTE: toarray() materializes the sparse TF-IDF matrices as dense arrays
# (rows x up to 10000 columns); this can use a lot of memory on large datasets.
X_train_df = pd.DataFrame(X_train_tfidf.toarray())
X_test_df = pd.DataFrame(X_test_tfidf.toarray())
X_train_df.to_csv(os.path.join(processed_data_path, 'X_train.csv'), index=False)
X_test_df.to_csv(os.path.join(processed_data_path, 'X_test.csv'), index=False)
pd.DataFrame(y_train).to_csv(os.path.join(processed_data_path, 'y_train.csv'), index=False)
pd.DataFrame(y_test).to_csv(os.path.join(processed_data_path, 'y_test.csv'), index=False)

print("Data preprocessing completed and saved.")
