In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

nltk.download('stopwords')
nltk.download('wordnet')

def clean_text(text):
    text = re.sub("[^a-zA-Z]", " ", text)
    
    text = text.lower()
    
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    cleaned_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    
    return " ".join(cleaned_words)

def load_data(filename):
    df = pd.read_csv(filename)
    df['statement'] = df['statement'].apply(clean_text)
    return df

def run_model(filename):
    data = load_data(filename)
    
    X_train, X_test, y_train, y_test = train_test_split(data['statement'], data['speaker'], test_size=0.2, random_state=42,shuffle=True)
    
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier(n_estimators=10,random_state=42))
    ])
    
    text_clf.fit(X_train, y_train)
    
    y_pred = text_clf.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy:.4f}')


run_model('ML Models/data_cleaned.csv')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 0.9656


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
import re
import pickle
import os
import time

def clean_text(text):
    text = re.sub("[^a-zA-Z]", " ", text)
    text = text.lower()
    return text

def load_data(filename):
    df = pd.read_csv(filename)
    df['statement'] = df['statement'].apply(clean_text)
    return df

def run_model(filename):
    data = load_data(filename)
    
    X_train, X_test, y_train, y_test = train_test_split(data['statement'], data['speaker'], test_size=0.2, random_state=42, shuffle=True)
    
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('clf', RandomForestClassifier(n_estimators=10, random_state=42))
    ])
    
    text_clf.fit(X_train, y_train)
    
    y_pred = text_clf.predict(X_test)
    
    # Evaluation metrics
    accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)
    f_score = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    print("Evaluation Metrics:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_rep)
    print(f"F-score: {f_score:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")

    # Serialize the model and get its size
    model_name = "random_forest_model.pkl"
    with open(model_name, 'wb') as model_file:
        pickle.dump(text_clf, model_file)

    # Get the size of the model file
    model_size = os.path.getsize(model_name)
    print(f"Model size: {model_size} bytes")

run_model('data_cleaned.csv')


NameError: name 'Pipeline' is not defined