In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import string

# Download the NLTK resources used by the preprocessing step:
#   - 'stopwords' and 'wordnet' back the stop-word filter and the lemmatizer
#   - 'punkt' / 'punkt_tab' hold the tokenizer models that nltk.word_tokenize
#     loads; which of the two is needed depends on the installed NLTK
#     version, so both are requested (nltk.download reports an unknown
#     package id and returns False by default rather than raising)
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

# Load the full dataset from the CSV file
data = pd.read_csv('dataset.csv')

# Sample 50% of the dataset randomly (fixed seed so the sample is repeatable)
sampled_data = data.sample(frac=0.5, random_state=42)

# Text preprocessing: shared, build-once resources plus the public function

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Holds a single (stop_words, lemmatizer, stemmer) tuple once it has been built.
_PREPROCESS_CACHE = []


def _get_preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer, stemmer)`` tuple.

    The tuple is built on first use instead of on every call to
    ``preprocess_text``, so applying the function to a whole column does not
    reload the stop-word list or re-create the NLTK objects once per row.
    """
    if not _PREPROCESS_CACHE:
        # Append one fully built tuple so a failure while loading any single
        # resource never leaves a half-populated cache behind.
        _PREPROCESS_CACHE.append((
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
            PorterStemmer(),
        ))
    return _PREPROCESS_CACHE[0]


def preprocess_text(text):
    """Normalise one piece of text for TF-IDF vectorisation.

    Steps, in order: lowercase, strip ASCII punctuation, tokenize with
    ``nltk.word_tokenize``, drop English stop words, lemmatize each token
    with ``WordNetLemmatizer``, stem it with ``PorterStemmer``, and join the
    tokens back together with single spaces.

    Args:
        text: The raw text. Anything that is not a ``str`` (for example
            ``None`` or the float NaN pandas uses for an empty CSV cell) is
            treated as missing.

    Returns:
        The preprocessed text as a single space-separated string, or ``''``
        when ``text`` is not a string.
    """
    # Missing cells are not strings; without this guard `.lower()` would
    # raise AttributeError and abort the whole column-wise apply.
    if not isinstance(text, str):
        return ''

    stop_words, lemmatizer, stemmer = _get_preprocess_resources()

    # Lowercase first so the stop-word comparison below is case-insensitive,
    # then remove punctuation in a single pass.
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)

    # Tokenize, filter stop words, then lemmatize and stem each kept token.
    tokens = [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in nltk.word_tokenize(cleaned)
        if word not in stop_words
    ]

    # Join the tokens back to form the preprocessed text
    return ' '.join(tokens)

# Clean every transcript in place before any features are derived from it
sampled_data['transcript'] = sampled_data['transcript'].apply(preprocess_text)

# Features are the cleaned transcripts; targets are the values of the 'topic' column
X, y = sampled_data['transcript'], sampled_data['topic']

# Hold out 20% of the sampled rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vectorizer on the training split only, then reuse the fitted
# vectorizer to encode the held-out split
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Create the linear-kernel SVM and fit it on the TF-IDF training features
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(X_train_vectorized, y_train)

# Predict on the held-out split, then print overall accuracy and the
# per-class classification report
y_pred = svm_model.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nileshpal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nileshpal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 0.855879650895728
              precision    recall  f1-score   support

        ADHD       0.88      0.89      0.88      2044
         OCD       0.91      0.91      0.91      2576
   aspergers       0.78      0.78      0.78      1286
  depression       0.77      0.83      0.80      1440
        ptsd       0.89      0.82      0.85      1362

    accuracy                           0.86      8708
   macro avg       0.85      0.84      0.84      8708
weighted avg       0.86      0.86      0.86      8708

