In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, f1_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import pickle
import os
import time

# Fetch the NLTK resources that clean_text relies on (the stop-word list and
# the WordNet data used by the lemmatizer).
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Compiled once: matches every character that is not an ASCII letter.
_NON_LETTER_RE = re.compile(r"[^a-zA-Z]")

# Lazily filled cache so the NLTK resources are built once instead of on
# every call (clean_text is applied to each row of the dataset).
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Populate in one step so a failure above never leaves a half-filled cache.
        _TEXT_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalize a raw statement for vectorization.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stop words are
    dropped, and the remaining words are lemmatized with WordNet.

    Args:
        text: The raw statement. Anything that is not a ``str`` (for example
            the NaN pandas uses for an empty CSV cell) is treated as empty
            text instead of raising ``TypeError`` inside the regex.

    Returns:
        The cleaned words joined by single spaces; ``""`` when nothing is left.
    """
    if not isinstance(text, str):
        return ""

    words = _NON_LETTER_RE.sub(" ", text).lower().split()
    if not words:
        # Nothing to filter or lemmatize, so skip loading the NLTK resources.
        return ""

    stop_words, lemmatizer = _get_text_resources()
    return " ".join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )

def load_data(filename):
    """Read the CSV at *filename* and clean its ``statement`` column.

    Each value of ``statement`` is passed through ``clean_text`` and the
    column is overwritten with the result. The DataFrame is returned with
    all other columns as read.
    """
    frame = pd.read_csv(filename)
    cleaned_statements = frame['statement'].apply(clean_text)
    frame['statement'] = cleaned_statements
    return frame

def run_model(filename, model_name="random_forest_model.pkl"):
    """Train, evaluate and pickle a speaker classifier for the CSV at *filename*.

    The data is loaded with ``load_data`` and split 80/20 (seed 42) into
    train and test sets, using the ``statement`` column as input text and the
    ``speaker`` column as label. A pipeline of token counts -> TF-IDF ->
    10-tree random forest is fitted on the training part. Train/test
    accuracy, the classification report, the size of the pickled model and
    the training time are printed.

    Args:
        filename: Path of the CSV file to train on.
        model_name: Path the fitted pipeline is pickled to. Defaults to the
            previously hard-coded ``"random_forest_model.pkl"``.

    Returns:
        None. Results are reported on stdout and the model is written to
        *model_name*.
    """
    data = load_data(filename)

    X_train, X_test, y_train, y_test = train_test_split(
        data['statement'],
        data['speaker'],
        test_size=0.2,
        random_state=42,
        shuffle=True,
    )

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()

    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', RandomForestClassifier(n_estimators=10, random_state=42)),
    ])
    text_clf.fit(X_train, y_train)

    training_time = time.perf_counter() - start_time

    y_pred = text_clf.predict(X_test)

    # Evaluation metrics
    train_accuracy = text_clf.score(X_train, y_train)
    test_accuracy = accuracy_score(y_test, y_pred)
    classification_rep = classification_report(y_test, y_pred)

    print("Evaluation Metrics:")
    print(f"Train Accuracy: {train_accuracy:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_rep)

    # Serialize the whole pipeline so the vectorizer vocabulary travels with
    # the classifier.
    with open(model_name, 'wb') as model_file:
        pickle.dump(text_clf, model_file)

    # Report the on-disk size of the pickled model
    model_size = os.path.getsize(model_name)
    print(f"Model size: {model_size} bytes")
    print(f"Training time: {training_time:.4f} seconds")

# Run the whole workflow on data_cleaned.csv: trains the classifier, prints
# the evaluation metrics and writes the pickled model to disk.
run_model('data_cleaned.csv')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Evaluation Metrics:
Train Accuracy: 0.9992
Test Accuracy: 0.9656

Classification Report:
              precision    recall  f1-score   support

      Doctor       0.96      0.97      0.97     51179
     Patient       0.97      0.96      0.97     51789

    accuracy                           0.97    102968
   macro avg       0.97      0.97      0.97    102968
weighted avg       0.97      0.97      0.97    102968

Model size: 123060035 bytes
Training time: 1022.0131 seconds
