###  Schizophrenia Diagnosis

In [None]:
## Read the raw CSV into a DataFrame and preview its first rows
import numpy as np
import pandas as pd

df = pd.read_csv(filepath_or_buffer='schizophrenia_dataset.csv')
df.head(n=5)


In [None]:
## Drop rows that contain missing values (if any) and renumber the index.
## dropna() returns a NEW DataFrame, so the result has to be assigned back;
## without the assignment the cleaned frame is only displayed and df keeps
## its NaN rows.
df = df.dropna().reset_index(drop=True)

In [None]:
## (number of rows, number of columns) of the DataFrame
df.shape

In [None]:
## Inspect each column's dtype and non-null count.
## Any column that is not numeric would have to be converted before the analysis.
df.info()
## NOTE(review): the author observed only integer dtypes in their run — confirm against the output above

In [None]:
## Replace the original column headers with English names for readability.
## The assignment is positional: the list must hold exactly one name per
## column, in the same order as the columns of the file.
## Names carry no leading/trailing whitespace ('Living_Area', 'Family_History'),
## so they can be used for lookup exactly as written, e.g. df['Living_Area'].
df.columns = [
    'Patient_ID',
    'Age',
    'Gender',
    'Education_Level',
    'Marital_Status',
    'Occupation',
    'Income_Level',
    'Living_Area',
    'Diagnosis',
    'Disease_Duration',
    'Hospitalizations',
    'Family_History',
    'Substance_Use',
    'Suicide_Attempt',
    'Positive_Symptom_Score',
    'Negative_Symptom_Score',
    'GAF_Score',
    'Social_Support',
    'Stress_Factors',
    'Medication_Adherence',
]


In [None]:
## Preview the first five rows of the DataFrame
df.head(5)

In [None]:
## Number of missing values in each column
df.isna().sum()

##### No label encoding is needed because the dataset is already numerically encoded; if it were not, the categorical columns would have to be label-encoded first.

In [None]:
## Target vector: the Diagnosis column.
y = df['Diagnosis']
## Feature matrix: everything except the target and Patient_ID
## (presumably just a row identifier — verify it carries no signal).
X = df.drop(['Patient_ID', 'Diagnosis'], axis=1)
X.columns


In [None]:
from sklearn.model_selection import train_test_split

## Hold out 20% of the rows for testing; the fixed random_state keeps the
## split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## Only the last expression of a cell is displayed, so the shapes are printed
## explicitly instead of being silently discarded.
print("train shape:", X_train.shape, "| test shape:", X_test.shape)
X_train.columns

In [None]:
from sklearn.preprocessing import StandardScaler

## Columns that are standardised; all other columns are left as they are.
num_features = ['Age', 'Positive_Symptom_Score', 'Negative_Symptom_Score', 'GAF_Score', 'Disease_Duration']

## Work on explicit copies: the frames returned by train_test_split are
## selections of X, and assigning columns on them can raise
## SettingWithCopyWarning (depending on the pandas version).
X_train = X_train.copy()
X_test = X_test.copy()

## Fit the scaler on the training split only, then apply the same
## mean/variance to the test split so no test statistics leak into training.
scaler = StandardScaler()
X_train[num_features] = scaler.fit_transform(X_train[num_features])
X_test[num_features] = scaler.transform(X_test[num_features])

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

## Recursive feature elimination driven by a default logistic regression,
## keeping the 10 best-ranked features.
logistic = LogisticRegression()
rfe = RFE(estimator=logistic, n_features_to_select=10)
rfe.fit(X_train, Y_train)

## Reduce both splits to the selected columns (the selector is fitted on the
## training split only).
X_train_ref = rfe.transform(X_train)
X_test_ref = rfe.transform(X_test)

print("Selected Features:", X.columns[rfe.get_support()])


In [None]:
from sklearn.linear_model import LogisticRegression

## Train a default logistic regression on the RFE-selected training features,
## then predict the labels of the matching test features.
logistic = LogisticRegression().fit(X_train_ref, Y_train)
y_pred = logistic.predict(X_test_ref)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

## scikit-learn metrics take (y_true, y_pred) in that order.
## Accuracy is symmetric, but with the arguments swapped the confusion matrix
## is transposed and precision/recall are exchanged in the report.
score = accuracy_score(Y_test, y_pred)
## Rows = actual class, columns = predicted class.
con_max = confusion_matrix(Y_test, y_pred)
clss = classification_report(Y_test, y_pred)
print("accuracy score:", score)
print("confusion matrix: ", con_max)
print("classification report", clss)

##### Hyperparameter tuning

In [None]:
## Search space for tuning the logistic regression (goal: higher accuracy).
model = LogisticRegression()
penalty = ['l1', 'l2', 'elasticnet']
c_values = [100, 10, 1.0, 0.1, 0.01]
solver = ['newton-cg', 'lbfgs', 'sag', 'saga']

## Not every penalty works with every solver: newton-cg, lbfgs and sag only
## support 'l2'; 'l1' and 'elasticnet' need 'saga'; and 'elasticnet' also
## requires an l1_ratio. A plain cartesian product of the three lists makes
## most candidates fail to fit (scored NaN with FitFailedWarning), so the grid
## is given as a list of sub-grids containing only valid combinations.
params = [
    dict(penalty=['l2'], C=c_values, solver=solver),
    dict(penalty=['l1'], C=c_values, solver=['saga']),
    dict(penalty=['elasticnet'], C=c_values, solver=['saga'], l1_ratio=[0.25, 0.5, 0.75]),
]

In [None]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

## 5-fold stratified cross-validation for scoring every candidate.
cv = StratifiedKFold(n_splits=5)

## Exhaustive search over params, ranked by accuracy, using all CPU cores.
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)


In [None]:
## Run the search on the selected training features, then predict the test
## split with the fitted search object.
y_pred = grid.fit(X_train_ref, Y_train).predict(X_test_ref)
grid.best_params_
## output recorded by the author: {'C': 100, 'penalty': 'l1', 'solver': 'saga'}

In [None]:
## Evaluate the predictions of the tuned model.
## scikit-learn metrics take (y_true, y_pred) in that order; swapping them
## transposes the confusion matrix and exchanges precision and recall.
score = accuracy_score(Y_test, y_pred)
## Rows = actual class, columns = predicted class.
con_max = confusion_matrix(Y_test, y_pred)
clss = classification_report(Y_test, y_pred)
print("accuracy score:", score)
print("confusion matrix: ", con_max)
print("classification report", clss)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 

## Build the matrix here with (y_true, y_pred) ordering so that rows are the
## actual classes and columns the predicted ones. This keeps the axis labels
## below correct no matter how con_max was computed in an earlier cell.
cm_actual_vs_pred = confusion_matrix(Y_test, y_pred)

plt.figure(figsize=(6, 4))
## NOTE(review): tick labels assume the classes sort as 0 = not schizophrenic, 1 = schizophrenic — confirm
sns.heatmap(cm_actual_vs_pred, annot=True, fmt='d', cmap='Blues', xticklabels=['Not Schizophrenic', 'Schizophrenic'],
                yticklabels=['Not Schizophrenic', 'Schizophrenic'])
plt.title('logistic Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()