<a href="https://colab.research.google.com/github/MELRIAN1910/MachineLearning/blob/main/Practice%20Projects/Alzheimer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#import dependencies
import numpy as np
import re
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the speech/stage dataset from the Colab working directory
DATASET_PATH = '/content/alzheimers.csv'
dataset = pd.read_csv(DATASET_PATH)
print(dataset)

                                                Speech     Stage
0    This morning, I ate breakfast and later ate br...     Early
1    The day... uh... started with visited a friend...  Moderate
2    read a book... um... I don't know... uh... wha...    Severe
3    My day started with ate breakfast, followed by...     Early
4    I... um... woke up and then uh... worked on a ...  Moderate
..                                                 ...       ...
505  visited a friend um... was in the morning, the...  Moderate
506  I... uh... was... uh... going to visited a fri...    Severe
507  I remember listened to music in the morning, t...     Early
508  The day... uh... started with woke up, then......  Moderate
509  Something... about... uh... went shopping I th...    Severe

[510 rows x 2 columns]


In [None]:
# Number of (rows, columns) in the dataset as loaded
dataset.shape

(510, 2)

In [None]:
# Count missing values in every column
dataset.isna().sum()

Unnamed: 0,0
Speech,0
Stage,0


In [None]:
# Count how many times each speech sample occurs; counts above 1 indicate duplicates
dataset['Speech'].value_counts()

Unnamed: 0_level_0,count
Speech,Unnamed: 1_level_1
I... uh... was... uh... going to read a book but... uh... I forgot.,10
read a book... um... I don't know... uh... what... where... I am.,6
watched TV... um... I don't know... uh... what... where... I am.,6
cooked dinner... um... I don't know... uh... what... where... I am.,6
worked on a project... um... I don't know... uh... what... where... I am.,5
...,...
"I remember walked in the park in the morning, then I woke up and later walked in the park.",1
Uh... listened to music... uh... something... uh... ate breakfast and then... uh... read a book?,1
"The day... uh... started with worked on a project, then... um... ate breakfast and later worked on a project.",1
"I remember went shopping in the morning, then I cooked dinner and later worked on a project.",1


In [None]:
# Remove duplicate rows in place (no subset is given, so a row is dropped
# only when every column matches an earlier row)
dataset.drop_duplicates(inplace=True)
# Re-count the speech samples to check the result
dataset['Speech'].value_counts()

Unnamed: 0_level_0,count
Speech,Unnamed: 1_level_1
"This morning, I ate breakfast and later ate breakfast before listened to music in the evening.",1
"I remember cooked dinner in the morning, then I cooked dinner and later cooked dinner.",1
Uh... worked on a project... uh... something... uh... woke up and then... uh... walked in the park?,1
"woke up um... was in the morning, then I uh... watched TV and later... visited a friend.",1
I had a great time read a book with went shopping and then cooked dinner.,1
...,...
I... um... visited a friend and then uh... worked on a project before uh... went shopping.,1
"I remember visited a friend in the morning, then I went shopping and later visited a friend.",1
I... um... went shopping and then uh... watched TV before uh... listened to music.,1
Today I visited a friend and then visited a friend before heading to walked in the park.,1


In [None]:
# Function to count repeated words and filler words in the speech
def extract_features(speech):
    """Count immediate word repetitions and hesitation markers in a transcript.

    Parameters
    ----------
    speech : str
        Raw transcription text.

    Returns
    -------
    tuple[int, int]
        ``(repeated_words, filler_words)``:

        * ``repeated_words`` - number of words that are immediately followed
          by the very same word, separated only by whitespace and/or pause
          dots (so both "I I" and "I... I" count once, "go go go" counts twice).
        * ``filler_words`` - number of "um" / "uh" tokens plus runs of two or
          more dots.
    """
    # Lowercase and drop everything except word characters, whitespace,
    # dots (needed to see "..." pauses) and apostrophes.
    cleaned_speech = re.sub(r'[^\w\s\.\']', '', speech.lower())

    # A word followed by itself. The trailing \b stops prefix matches such as
    # "the then"; [\s.]+ lets pause dots sit between the two occurrences; the
    # lookahead keeps the second word available so chains are fully counted.
    repeated_words = len(re.findall(r'\b(\w+)\b(?=[\s.]+\1\b)', cleaned_speech))

    # Hesitation markers: the fillers "um"/"uh" and any run of 2+ dots.
    filler_words = len(re.findall(r'\bum\b|\buh\b|\.{2,}', cleaned_speech))

    # Return the extracted features
    return repeated_words, filler_words

In [None]:
# Run the feature extraction over every speech sample and split the
# resulting (repeated, filler) pairs into one column each
repeated_counts, filler_counts = zip(*map(extract_features, dataset['Speech']))
dataset['Repeated_Words'] = repeated_counts
dataset['Filler_Words'] = filler_counts

In [None]:
# Number of (rows, columns) after de-duplication and adding the two feature columns
dataset.shape

(432, 4)

In [None]:
# Show the first rows of the dataset, including the new feature columns
dataset.head()

Unnamed: 0,Speech,Stage,Repeated_Words,Filler_Words
0,"This morning, I ate breakfast and later ate br...",Early,0,0
1,The day... uh... started with visited a friend...,Moderate,0,6
2,read a book... um... I don't know... uh... wha...,Severe,0,8
3,"My day started with ate breakfast, followed by...",Early,0,0
4,I... um... woke up and then uh... worked on a ...,Moderate,0,7


# Label Encoding

In [None]:
# Fit the encoder on the stage names, then store the integer codes as the target column
label_encoder = LabelEncoder().fit(dataset['Stage'])
dataset['Stage_Label'] = label_encoder.transform(dataset['Stage'])

In [None]:
# Feature matrix (the two engineered counts) and encoded target
FEATURE_COLUMNS = ['Repeated_Words', 'Filler_Words']
X = dataset[FEATURE_COLUMNS]
Y = dataset['Stage_Label']

In [None]:
# Inspect the feature matrix
print(X)

     Repeated_Words  Filler_Words
0                 0             0
1                 0             6
2                 0             8
3                 0             0
4                 0             7
..              ...           ...
505               0             5
506               0             9
507               0             0
508               0             6
509               0             5

[432 rows x 2 columns]


In [None]:
# Inspect the encoded target labels
print(Y)

0      0
1      1
2      2
3      0
4      1
      ..
505    1
506    2
507    0
508    1
509    2
Name: Stage_Label, Length: 432, dtype: int64


# Train Test Split

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
split = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = split
print(X.shape, X_train.shape, X_test.shape)

(432, 2) (345, 2) (87, 2)


# Build the model

In [None]:
# Initialize the RandomForestClassifier model (fixed random_state for repeatable results)
rf_model = RandomForestClassifier(random_state=42)

In [None]:
# Train the model on the training features and their encoded stage labels
rf_model.fit(X_train, Y_train)

In [None]:
# Predict encoded stage labels for the held-out test features
y_pred = rf_model.predict(X_test)

In [None]:
# Compare the test-set predictions with the true labels
accuracy = accuracy_score(Y_test, y_pred)
print('Model accuracy: {}'.format(accuracy))

Model accuracy: 0.9770114942528736


In [None]:
import pickle

# Objects to persist, keyed by output file name:
#   - the trained RandomForest model
#   - the LabelEncoder used to transform target labels
#   - the feature extraction function
# NOTE(review): pickle stores a function as a reference to its module and
# name, not its code, so 'extract_features.pkl' can only be loaded where an
# `extract_features` function is already defined under the same module name.
artifacts = {
    'alzheimers_rf_model.pkl': rf_model,
    'alzheimers_label_encoder.pkl': label_encoder,
    'extract_features.pkl': extract_features,
}

for artifact_path, artifact in artifacts.items():
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)


# Build a Predictive System

In [None]:
# Function to predict Alzheimer's stage for a new transcription
def predict_stage(transcription):
    """Return the predicted stage name for a single transcription.

    The text is turned into the two engineered features with
    ``extract_features``, classified by the fitted ``rf_model``, and the
    encoded class is mapped back to its stage name via ``label_encoder``.
    """
    # One-row frame whose column names match the training features
    feature_values = extract_features(transcription)
    features = pd.DataFrame([list(feature_values)], columns=['Repeated_Words', 'Filler_Words'])

    # Classify, then translate the encoded class back to the original label
    stage_label = rf_model.predict(features)[0]
    return label_encoder.inverse_transform([stage_label])[0]

In [None]:
# Example: run the predictor on a hand-written transcription
new_transcription = "I... uh... went to the... um... what was it... the place with... uh... people?"
predicted_stage = predict_stage(new_transcription)
print(f"Predicted Alzheimer's stage:{predicted_stage}")

Predicted Alzheimer's stage:Severe


In [None]:
!pip install openai-whisper
!sudo apt update && sudo apt install ffmpeg

Collecting openai-whisper
  Downloading openai-whisper-20240930.tar.gz (800 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/800.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m204.8/800.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m798.7/800.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m800.5/800.5 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->openai-whisper)
  Downloadin

In [None]:
from google.colab import files

# Upload an audio file manually through the Colab file picker
uploaded = files.upload()
if not uploaded:
    # Without this check an empty result would surface as an opaque IndexError
    raise ValueError("No audio file was uploaded.")

# Name of the first uploaded file
audio_file = next(iter(uploaded))

Saving WhatsApp Audio 2025-03-16 at 9.02.59 PM.mp3 to WhatsApp Audio 2025-03-16 at 9.02.59 PM.mp3


In [None]:
import whisper

# "medium" or "large" gives better results
WHISPER_MODEL_NAME = "medium"
model = whisper.load_model(WHISPER_MODEL_NAME)

# word_timestamps=True enables word-level timestamps in the result
result = model.transcribe(audio_file, word_timestamps=True)



In [None]:
# Gap (in seconds) between two consecutive words that is treated as a hesitation
PAUSE_THRESHOLD_SECONDS = 0.6
# Text inserted into the transcription wherever such a gap is found
PAUSE_MARKER = "uh...."


def build_transcription(segments, pause_threshold=PAUSE_THRESHOLD_SECONDS):
    """Join word-level transcription output into one string, marking long pauses.

    Parameters
    ----------
    segments : iterable of dict
        Segments as found under ``result["segments"]``; each must have a
        ``"words"`` list whose items carry ``"word"``, ``"start"`` and ``"end"``.
    pause_threshold : float
        Minimum gap in seconds between the end of one word and the start of
        the next for ``PAUSE_MARKER`` to be inserted.

    Returns
    -------
    str
        The words separated by single spaces, with ``PAUSE_MARKER`` inserted
        at every detected pause.
    """
    tokens = []
    prev_end = None  # End time of the previous word; None until the first word is seen

    for segment in segments:
        for word_info in segment["words"]:
            start = word_info["start"]
            end = word_info["end"]

            # Detect pauses (gap to the previous word longer than the threshold)
            if prev_end is not None and (start - prev_end) > pause_threshold:
                tokens.append(PAUSE_MARKER)

            # Replace "a..." with "uh..." (\b ensures only a standalone "a" is
            # rewritten), then strip the surrounding whitespace the tokens
            # carry so that joining does not produce doubled spaces.
            word = re.sub(r'\ba\.\.\.', "uh...", word_info["word"]).strip()
            if word:
                tokens.append(word)

            prev_end = end

    return " ".join(tokens)


# Extract words & timestamps from the transcription result
new_transcription = build_transcription(result["segments"])

print("Final Transcription:", new_transcription)

Final Transcription:  I  was  looking  for,  I  don't  know  what,  uh....  maybe  my  phone. 


In [None]:
# NOTE(review): this hand-written string replaces whatever `new_transcription`
# held before, so the prediction below is for this literal text only.
new_transcription = "I... I... uh... was looking for... um... I don't know what... uh... maybe my phone?"
predicted_stage = predict_stage(new_transcription)
print(f"Predicted Alzheimer's stage:{predicted_stage}")

Predicted Alzheimer's stage:Severe


In [None]:
# Show the transcription text that was passed to the predictor
print(new_transcription)

I... I... uh... was looking for... um... I don't know what... uh... maybe my phone?
