In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as mt
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.feature_selection import chi2
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
from google.colab import files

# Upload the dataset into the Colab runtime. files.upload() returns a mapping of
# file name -> raw bytes; binding it to a name keeps the notebook from echoing
# the entire CSV (patient-level rows) into the cell output.
uploaded = files.upload()

Saving PCOS_data-1.csv to PCOS_data-1.csv


{'PCOS_data-1.csv': b'Patient File No., Age (yrs),Weight (Kg),Height(Cm) ,BMI,Blood Group,Pulse rate(bpm) ,RR (breaths/min),Hb(g/dl),Cycle(R/I),Cycle length(days),Marraige Status (Yrs),Pregnant(Y/N),No. of abortions,  I   beta-HCG(mIU/mL),II    beta-HCG(mIU/mL),FSH(mIU/mL),LH(mIU/mL),FSH/LH,Hip(inch),Waist(inch),Waist:Hip Ratio,TSH (mIU/L),AMH(ng/mL),PRL(ng/mL),Vit D3 (ng/mL),PRG(ng/mL),RBS(mg/dl),Weight gain(Y/N),hair growth(Y/N),Skin darkening (Y/N),Hair loss(Y/N),Pimples(Y/N),Fast food (Y/N),Reg.Exercise(Y/N),BP _Systolic (mmHg),BP _Diastolic (mmHg),Follicle No. (L),Follicle No. (R),Avg. F size (L) (mm),Avg. F size (R) (mm),Endometrium (mm),PCOS (Y/N)\r\n1,28,44.6,152,19.3,15,78,22,10.48,2,5,7,0,0,1.99,1.99,7.95,3.68,2.16,36,30,0.83,0.68,2.07,45.16,17.1,0.57,92,0,0,0,0,0,1,0,110,80,3,3,18,18,8.5,0\r\n2,36,65,161.5,24.9,15,74,20,11.7,2,5,11,1,0,60.8,1.99,6.73,1.09,6.17,38,32,0.84,3.16,1.53,20.09,61.3,0.97,92,0,0,0,0,0,0,0,120,70,3,5,15,14,3.7,0\r\n3,33,68.8,165,25.3,11,72,18,11.8,2,5

In [None]:
# Read the uploaded CSV from the runtime's working directory.
df = pd.read_csv("PCOS_data-1.csv")

# Pre-processing

In [None]:
# Clean the frame: treat +/-inf as missing, impute missing values with the
# column mean, then drop exact duplicate rows.
# NOTE(review): df.mean() presumes every column is numeric — confirm for this CSV.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(value=df.mean())
# drop_duplicates() returns a new frame; the result has to be kept, otherwise
# the call has no effect on `df`.
df = df.drop_duplicates()
df.head()


In [None]:
# Feature matrix: every column except the first and the last one.
x = df.iloc[:, 1:-1]
# Target: the last column. Selected with -1 so it always matches the column the
# feature slice above leaves out, instead of relying on the literal position 42.
y = df.iloc[:, -1]
# Preview only the first rows rather than rendering the whole frame.
x.head()


# Chi2

In [None]:
# chi2 yields a pair of arrays (statistics, p-values), one entry per feature column.
chi_scores = chi2(X=x, y=y)
chi_scores

In [None]:
# Rank the features by their chi-squared statistic (element 0 of the chi2
# result; the p-values are element 1), largest first, and plot the ranking.
chi_values = pd.Series(chi_scores[0], index=x.columns).sort_values(ascending=False)
chi_values.plot.bar()

In [None]:
# Show the per-feature chi-squared values as a table.
chi_values

# Extra-tree

In [None]:
# Fit an extra-trees ensemble on all features; its feature_importances_ are
# read in the next cell. A fixed seed keeps the importance ranking the same
# from one run to the next.
model = ExtraTreesClassifier(random_state=1)
model.fit(x, y)

In [None]:
# Importance of each feature according to the fitted ensemble, highest first.
imp = pd.Series(model.feature_importances_, index=x.columns).sort_values(ascending=False)
imp

In [None]:
# Bar chart of the sorted feature importances.
imp.plot(kind='bar')

Train-test split of full dataset

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=1,
)

## Random Forest with GridSearch

In [None]:
# Search space reused by the random-forest grid searches further down.
# NOTE(review): 'random_state' is part of the grid, so the seed itself is being
# tuned like a hyper-parameter — confirm that this is intended.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 2, 8],
    max_features=[0.2, 0.3, 1.0],
    max_samples=[0.5, 0.75, 1.0],
    random_state=[10, 20, 30],
)
rf_clf = RandomForestClassifier()

# Exhaustive search with 2-fold cross-validation on the full-feature training split.
rf_grid_search = GridSearchCV(rf_clf, param_grid, cv=2, return_train_score=False)
rf_grid_search.fit(xtrain, ytrain)

# Report the winning parameter combination and its cross-validated score.
best_params, best_score = rf_grid_search.best_params_, rf_grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)




Best Parameters: {'max_depth': 8, 'max_features': 0.2, 'max_samples': 1.0, 'n_estimators': 200, 'random_state': 20}
Best Score: 0.9036848266107399


top12 serially chi2 with grid search 92.6%

In [None]:
# Random forest on the top-12 chi2 feature subset.
t12columnsl = ['Follicle No. (R)', 'Follicle No. (L)', 'Skin darkening (Y/N)', 'hair growth(Y/N)',
               'Weight gain(Y/N)', 'Weight (Kg)', 'Cycle(R/I)', 'Fast food (Y/N)', 'Pimples(Y/N)',
               'Marraige Status (Yrs)', ' Age (yrs)', 'BMI']
t12data = df[t12columnsl]

# Train-test split of the feature subset (same test size and seed as the full-data split).
t12xtrain, t12xtest, t12ytrain, t12ytest = train_test_split(t12data, y, test_size=.25, random_state=1)

# Grid search over the shared random-forest grid `param_grid`.
rf_clf_t12 = RandomForestClassifier()
rf_grid_search_t12 = GridSearchCV(rf_clf_t12, param_grid, cv=2, return_train_score=False)
rf_grid_search_t12.fit(t12xtrain, t12ytrain)

# Best parameter combination and its cross-validated score.
best_params_t12 = rf_grid_search_t12.best_params_
best_score_t12 = rf_grid_search_t12.best_score_
print("Best Parameters:", best_params_t12)
print("Best Score:", best_score_t12)

# Evaluation on the held-out rows. Every metric is scored against t12ytest, the
# labels produced by the split in this cell, so the cell does not depend on
# `ytest` from the full-data split.
t12y_pred = rf_grid_search_t12.predict(t12xtest)
preC_t12 = precision_score(t12ytest, t12y_pred)
reC_t12 = recall_score(t12ytest, t12y_pred)
fscoreC_t12 = f1_score(t12ytest, t12y_pred)
accuracyC12 = accuracy_score(t12ytest, t12y_pred)

print("Accuracy:", accuracyC12)
print("Precision:", preC_t12)
print("Recall:", reC_t12)
print("F1 Score:", fscoreC_t12)

Best Parameters: {'max_depth': 8, 'max_features': 0.3, 'max_samples': 0.5, 'n_estimators': 50, 'random_state': 20}
Best Score: 0.9135614300346291
Accuracy: 0.9264705882352942
Precision: 0.925
Recall: 0.8409090909090909
F1 Score: 0.8809523809523809


top12 serially extra tree with gridsearch 90.4%

In [None]:
# Random forest on the top-12 ExtraTree feature subset.
t12columns2 = ['Follicle No. (R)', 'Follicle No. (L)', 'hair growth(Y/N)', 'Skin darkening (Y/N)',
               'Weight gain(Y/N)', 'Cycle(R/I)', 'Fast food (Y/N)', 'Pimples(Y/N)',
               'Cycle length(days)', 'Avg. F size (L) (mm)', 'Hip(inch)', 'Marraige Status (Yrs)']
t12data2 = df[t12columns2]

# Train-test split of the feature subset (same test size and seed as the full-data split).
t12xtrain_e, t12xtest_e, t12ytrain_e, t12ytest_e = train_test_split(t12data2, y, test_size=.25, random_state=1)

# Grid search over the shared random-forest grid `param_grid`.
rf_clf_t12 = RandomForestClassifier()
rf_grid_search_t12 = GridSearchCV(rf_clf_t12, param_grid, cv=2, return_train_score=False)
rf_grid_search_t12.fit(t12xtrain_e, t12ytrain_e)

# Best parameter combination and its cross-validated score.
best_params_t12 = rf_grid_search_t12.best_params_
best_score_t12 = rf_grid_search_t12.best_score_
print("Best Parameters:", best_params_t12)
print("Best Score:", best_score_t12)

# Evaluation on the held-out rows, scored against this cell's own test labels
# (t12ytest_e) rather than `ytest` from the full-data split.
t12y_pred_e = rf_grid_search_t12.predict(t12xtest_e)
preE = precision_score(t12ytest_e, t12y_pred_e)
reE = recall_score(t12ytest_e, t12y_pred_e)
fscoreE = f1_score(t12ytest_e, t12y_pred_e)
accuracyE12 = accuracy_score(t12ytest_e, t12y_pred_e)
print("Accuracy:", accuracyE12)
print("Precision:", preE)
print("Recall:", reE)
print("F1 Score:", fscoreE)

Best Parameters: {'max_depth': None, 'max_features': 0.2, 'max_samples': 1.0, 'n_estimators': 200, 'random_state': 20}
Best Score: 0.9110861825098766
Accuracy: 0.9044117647058824
Precision: 0.918918918918919
Recall: 0.7727272727272727
F1 Score: 0.8395061728395061


top10 without ultrasound serially chi2 with grid search 83.8%

In [None]:
# Random forest on the top-10 chi2 feature subset.
t10columnsl = ['Skin darkening (Y/N)', 'hair growth(Y/N)', 'Weight gain(Y/N)', 'Weight (Kg)',
               'Cycle(R/I)', 'Fast food (Y/N)', 'Pimples(Y/N)', 'Marraige Status (Yrs)',
               ' Age (yrs)', 'BMI']
t10data = df[t10columnsl]

# Train-test split of the feature subset (same test size and seed as the full-data split).
t10xtrain, t10xtest, t10ytrain, t10ytest = train_test_split(t10data, y, test_size=.25, random_state=1)

# Grid search over the shared random-forest grid `param_grid`.
rf_clf_t10 = RandomForestClassifier()
rf_grid_search_t10 = GridSearchCV(rf_clf_t10, param_grid, cv=2, return_train_score=False)
rf_grid_search_t10.fit(t10xtrain, t10ytrain)

# Best parameter combination and its cross-validated score.
best_params_t10 = rf_grid_search_t10.best_params_
best_score_t10 = rf_grid_search_t10.best_score_
print("Best Parameters:", best_params_t10)
print("Best Score:", best_score_t10)

# Evaluation on the held-out rows, scored against this cell's own test labels
# (t10ytest) rather than `ytest` from the full-data split.
t10y_pred = rf_grid_search_t10.predict(t10xtest)
preC = precision_score(t10ytest, t10y_pred)
reC = recall_score(t10ytest, t10y_pred)
fscoreC = f1_score(t10ytest, t10y_pred)
accuracyC10 = accuracy_score(t10ytest, t10y_pred)
print("Accuracy:", accuracyC10)
print("Precision:", preC)
print("Recall:", reC)
print("F1 Score:", fscoreC)

Best Parameters: {'max_depth': None, 'max_features': 0.2, 'max_samples': 0.5, 'n_estimators': 100, 'random_state': 10}
Best Score: 0.8641906062527436
Accuracy: 0.8382352941176471
Precision: 0.7619047619047619
Recall: 0.7272727272727273
F1 Score: 0.7441860465116279


top10 serially extra tree with grid search 83.8%

In [None]:
# Random forest on the top-10 ExtraTree feature subset.
t10columns2 = ['hair growth(Y/N)', 'Skin darkening (Y/N)', 'Weight gain(Y/N)', 'Cycle(R/I)',
               'Fast food (Y/N)', 'Pimples(Y/N)', 'Cycle length(days)', 'Hip(inch)',
               'Marraige Status (Yrs)', 'Waist:Hip Ratio']
t10data2 = df[t10columns2]

# Train-test split of the feature subset (same test size and seed as the full-data split).
t10xtrain_e, t10xtest_e, t10ytrain_e, t10ytest_e = train_test_split(t10data2, y, test_size=.25, random_state=1)

# Grid search over the shared random-forest grid `param_grid`.
rf_clf_t10 = RandomForestClassifier()
rf_grid_search_t10 = GridSearchCV(rf_clf_t10, param_grid, cv=2, return_train_score=False)
rf_grid_search_t10.fit(t10xtrain_e, t10ytrain_e)

# Best parameter combination and its cross-validated score.
best_params_t10 = rf_grid_search_t10.best_params_
best_score_t10 = rf_grid_search_t10.best_score_
print("Best Parameters:", best_params_t10)
print("Best Score:", best_score_t10)

# Evaluation on the held-out rows, scored against this cell's own test labels
# (t10ytest_e) rather than `ytest` from the full-data split.
t10y_pred = rf_grid_search_t10.predict(t10xtest_e)
preC_t10 = precision_score(t10ytest_e, t10y_pred)
reC_t10 = recall_score(t10ytest_e, t10y_pred)
fscoreC_t10 = f1_score(t10ytest_e, t10y_pred)
accuracyC10 = accuracy_score(t10ytest_e, t10y_pred)

print("Accuracy:", accuracyC10)
print("Precision:", preC_t10)
print("Recall:", reC_t10)
print("F1 Score:", fscoreC_t10)

Best Parameters: {'max_depth': None, 'max_features': 0.2, 'max_samples': 0.5, 'n_estimators': 100, 'random_state': 10}
Best Score: 0.8641906062527436
Accuracy: 0.8382352941176471
Precision: 0.7619047619047619
Recall: 0.7272727272727273
F1 Score: 0.7441860465116279


## Gradient_Boost with GridSearch

In [None]:
# Search space for the gradient-boosting grid searches.
param_grid_gb = dict(
    n_estimators=[10, 20, 30, 40, 50],
    learning_rate=[0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1],
    max_features=[2, 3, 4, 5, 6],
    max_depth=[2, 3, 4, 5, 6],
)
gb_clf = GradientBoostingClassifier()

# 5-fold cross-validated exhaustive search; fitted in the next cell.
gb_grid_search = GridSearchCV(gb_clf, param_grid_gb, cv=5, return_train_score=False)


In [None]:
# Run the gradient-boosting search on the full-feature training split.
gb_grid_search.fit(xtrain, ytrain)

# Report the winning parameter combination and its cross-validated score.
best_params, best_score = gb_grid_search.best_params_, gb_grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'learning_rate': 0.25, 'max_depth': 5, 'max_features': 6, 'n_estimators': 40}
Best Score: 0.9086419753086419


top12 serially chi2 with grid search 90.4%

In [None]:
 #Top 12 Feature of Chi2
# Gradient boosting on the top-12 chi2 feature subset.
t12columnsl = ['Follicle No. (R)', 'Follicle No. (L)', 'Skin darkening (Y/N)', 'hair growth(Y/N)',
               'Weight gain(Y/N)', 'Weight (Kg)', 'Cycle(R/I)', 'Fast food (Y/N)', 'Pimples(Y/N)',
               'Marraige Status (Yrs)', ' Age (yrs)', 'BMI']
t12data = df[t12columnsl]

# Train-test split of the feature subset (same test size and seed as the full-data split).
t12xtrain, t12xtest, t12ytrain, t12ytest = train_test_split(t12data, y, test_size=.25, random_state=1)

# Grid search over the shared gradient-boosting grid `param_grid_gb`.
gb_clf_t12 = GradientBoostingClassifier()
gb_grid_search_t12 = GridSearchCV(gb_clf_t12, param_grid_gb, cv=2, return_train_score=False)
gb_grid_search_t12.fit(t12xtrain, t12ytrain)

# Best parameter combination and its cross-validated score.
best_params_t12 = gb_grid_search_t12.best_params_
best_score_t12 = gb_grid_search_t12.best_score_
print("Best Parameters:", best_params_t12)
print("Best Score:", best_score_t12)

# Evaluation on the held-out rows, scored against this cell's own test labels
# (t12ytest) rather than `ytest` from the full-data split.
t12y_pred = gb_grid_search_t12.predict(t12xtest)
preC = precision_score(t12ytest, t12y_pred)
reC = recall_score(t12ytest, t12y_pred)
fscoreC = f1_score(t12ytest, t12y_pred)
accuracyC12 = accuracy_score(t12ytest, t12y_pred)
print("Accuracy:", accuracyC12)
print("Precision:", preC)
print("Recall:", reC)
print("F1 Score:", fscoreC)



Best Parameters: {'learning_rate': 0.075, 'max_depth': 2, 'max_features': 4, 'n_estimators': 40}
Best Score: 0.9110983758474369
Accuracy: 0.9044117647058824
Precision: 0.918918918918919
Recall: 0.7727272727272727
F1 Score: 0.8395061728395061


top12 serially extra tree with grid search 89.7%

In [None]:
# Gradient boosting on the top-12 ExtraTree feature subset.
t12columns2 = ['Follicle No. (R)', 'Follicle No. (L)', 'hair growth(Y/N)', 'Skin darkening (Y/N)',
               'Weight gain(Y/N)', 'Cycle(R/I)', 'Fast food (Y/N)', 'Pimples(Y/N)',
               'Cycle length(days)', 'Avg. F size (L) (mm)', 'Hip(inch)', 'Marraige Status (Yrs)']
t12data2 = df[t12columns2]

# Train-test split of the feature subset (same test size and seed as the full-data split).
t12xtrain_e, t12xtest_e, t12ytrain_e, t12ytest_e = train_test_split(t12data2, y, test_size=.25, random_state=1)

# Grid search over the shared gradient-boosting grid `param_grid_gb`.
gb_clf_t12 = GradientBoostingClassifier()
gb_grid_search_t12 = GridSearchCV(gb_clf_t12, param_grid_gb, cv=2, return_train_score=False)
gb_grid_search_t12.fit(t12xtrain_e, t12ytrain_e)

# Best parameter combination and its cross-validated score.
best_params_t12 = gb_grid_search_t12.best_params_
best_score_t12 = gb_grid_search_t12.best_score_
print("Best Parameters:", best_params_t12)
print("Best Score:", best_score_t12)

# Evaluation on the held-out rows, scored against this cell's own test labels
# (t12ytest_e) rather than `ytest` from the full-data split.
t12y_pred = gb_grid_search_t12.predict(t12xtest_e)
preE = precision_score(t12ytest_e, t12y_pred)
reE = recall_score(t12ytest_e, t12y_pred)
fscoreE = f1_score(t12ytest_e, t12y_pred)
accuracyE12 = accuracy_score(t12ytest_e, t12y_pred)
print("Accuracy:", accuracyE12)
print("Precision:", preE)
print("Recall:", reE)
print("F1 Score:", fscoreE)



Best Parameters: {'learning_rate': 0.25, 'max_depth': 2, 'max_features': 6, 'n_estimators': 10}
Best Score: 0.9111105691849973
Accuracy: 0.8970588235294118
Precision: 0.9166666666666666
Recall: 0.75
F1 Score: 0.8250000000000001


top10 serially chi2 with grid search 85.29%

In [None]:
# Gradient boosting on the top-10 chi2 feature subset.
t10columnsl = ['Skin darkening (Y/N)', 'hair growth(Y/N)', 'Weight gain(Y/N)', 'Weight (Kg)',
               'Cycle(R/I)', 'Fast food (Y/N)', 'Pimples(Y/N)', 'Marraige Status (Yrs)',
               ' Age (yrs)', 'BMI']
t10data = df[t10columnsl]

# Train-test split of the feature subset (same test size and seed as the full-data split).
t10xtrain, t10xtest, t10ytrain, t10ytest = train_test_split(t10data, y, test_size=.25, random_state=1)

# Grid search over the shared gradient-boosting grid `param_grid_gb`.
gb_clf_t10 = GradientBoostingClassifier()
gb_grid_search_t10 = GridSearchCV(gb_clf_t10, param_grid_gb, cv=2, return_train_score=False)
gb_grid_search_t10.fit(t10xtrain, t10ytrain)

# Best parameter combination and its cross-validated score.
best_params_t10 = gb_grid_search_t10.best_params_
best_score_t10 = gb_grid_search_t10.best_score_
print("Best Parameters:", best_params_t10)
print("Best Score:", best_score_t10)

# Evaluation on the held-out rows, scored against this cell's own test labels
# (t10ytest) rather than `ytest` from the full-data split.
t10y_pred = gb_grid_search_t10.predict(t10xtest)
preC = precision_score(t10ytest, t10y_pred)
reC = recall_score(t10ytest, t10y_pred)
fscoreC = f1_score(t10ytest, t10y_pred)
accuracyC10 = accuracy_score(t10ytest, t10y_pred)
print("Accuracy:", accuracyC10)
print("Precision:", preC)
print("Recall:", reC)
print("F1 Score:", fscoreC)



Best Parameters: {'learning_rate': 0.1, 'max_depth': 2, 'max_features': 2, 'n_estimators': 40}
Best Score: 0.8592035311905575
Accuracy: 0.8529411764705882
Precision: 0.7857142857142857
Recall: 0.75
F1 Score: 0.7674418604651163


top10 serially ExtraTree with grid search 86%

In [None]:
# Gradient boosting on the ExtraTree feature subset labelled "top10".
# NOTE(review): this list holds 11 names (it includes 'Avg. F size (L) (mm)') —
# confirm which columns the "top10" experiment is meant to use.
t10columns2 = ['hair growth(Y/N)', 'Skin darkening (Y/N)', 'Weight gain(Y/N)', 'Cycle(R/I)',
               'Fast food (Y/N)', 'Pimples(Y/N)', 'Cycle length(days)', 'Avg. F size (L) (mm)',
               'Hip(inch)', 'Marraige Status (Yrs)', 'Waist:Hip Ratio']
# Build the frame under the name the split below reads. Previously the frame was
# stored as `t10data` while the split used `t10data2`, a variable this cell
# never defined, so the model was trained on whatever an earlier cell left there.
t10data2 = df[t10columns2]

# Train-test split of the feature subset (same test size and seed as the full-data split).
t10xtrain_e, t10xtest_e, t10ytrain_e, t10ytest_e = train_test_split(t10data2, y, test_size=.25, random_state=1)

# Grid search over the shared gradient-boosting grid `param_grid_gb`.
gb_clf_t10 = GradientBoostingClassifier()
gb_grid_search_t10 = GridSearchCV(gb_clf_t10, param_grid_gb, cv=2, return_train_score=False)
gb_grid_search_t10.fit(t10xtrain_e, t10ytrain_e)

# Best parameter combination and its cross-validated score.
best_params_t10 = gb_grid_search_t10.best_params_
best_score_t10 = gb_grid_search_t10.best_score_
print("Best Parameters:", best_params_t10)
print("Best Score:", best_score_t10)

# Evaluation on the held-out rows, scored against this cell's own test labels
# (t10ytest_e) rather than `ytest` from the full-data split.
t10y_pred = gb_grid_search_t10.predict(t10xtest_e)
preE = precision_score(t10ytest_e, t10y_pred)
reE = recall_score(t10ytest_e, t10y_pred)
fscoreE = f1_score(t10ytest_e, t10y_pred)
accuracyE10 = accuracy_score(t10ytest_e, t10y_pred)
print("Accuracy:", accuracyE10)
print("Precision:", preE)
print("Recall:", reE)
print("F1 Score:", fscoreE)



Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'max_features': 2, 'n_estimators': 30}
Best Score: 0.8592644978783592
Accuracy: 0.8602941176470589
Precision: 0.7906976744186046
Recall: 0.7727272727272727
F1 Score: 0.7816091954022988


## AdaBoost with GridSearch

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# Search space for the AdaBoost grid searches.
# NOTE(review): 'random_state' is part of the grid, so the seed itself is being
# tuned like a hyper-parameter — confirm that this is intended.
param_grid_ad = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1],
    random_state=[20, 30, 40],
)
ad_clf = AdaBoostClassifier()

# 2-fold cross-validated exhaustive search; fitted in the next cell.
AdaModel_grid_search = GridSearchCV(ad_clf, param_grid_ad, cv=2, return_train_score=False)

In [None]:
# Run the AdaBoost search on the full-feature training split.
AdaModel_grid_search.fit(xtrain, ytrain)

# Report the winning parameter combination and its cross-validated score.
best_params, best_score = AdaModel_grid_search.best_params_, AdaModel_grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'learning_rate': 0.05, 'n_estimators': 100, 'random_state': 20}
Best Score: 0.8938082231868507


In [None]:
# AdaBoost with hand-set parameters, trained and scored on the full-feature split.
# NOTE(review): these values are written out by hand rather than read from
# AdaModel_grid_search.best_params_ — confirm they are the intended settings.
ad_clf2 = AdaBoostClassifier(n_estimators=300, learning_rate=0.25, random_state=20)
ad_clf2.fit(xtrain, ytrain)
model = ad_clf2  # fit() returns the estimator itself, so `model` is the same object
ypred = model.predict(xtest)
print("Accuracy:", metrics.accuracy_score(ytest, ypred))

Accuracy: 0.8897058823529411


top12 serially extra tree 91%

In [None]:
# AdaBoost on the top-12 ExtraTree feature subset.
t12columns2 = ['Follicle No. (R)', 'Follicle No. (L)', 'hair growth(Y/N)', 'Skin darkening (Y/N)',
               'Weight gain(Y/N)', 'Cycle(R/I)', 'Fast food (Y/N)', 'Pimples(Y/N)',
               'Cycle length(days)', 'Avg. F size (L) (mm)', 'Hip(inch)', 'Marraige Status (Yrs)']
# Build the frame under the name the split below reads. Previously the frame was
# stored as `t12data` while the split used `t12data2`, a variable this cell
# never defined, so the cell only worked with state left behind by an earlier one.
t12data2 = df[t12columns2]

# Train-test split of the feature subset (same test size and seed as the full-data split).
t12xtrain_e, t12xtest_e, t12ytrain_e, t12ytest_e = train_test_split(t12data2, y, test_size=.25, random_state=1)

# Grid search over the shared AdaBoost grid `param_grid_ad`.
ad_clf_t12 = AdaBoostClassifier()
ad_grid_search_t12 = GridSearchCV(ad_clf_t12, param_grid_ad, cv=2, return_train_score=False)
ad_grid_search_t12.fit(t12xtrain_e, t12ytrain_e)

# Best parameter combination and its cross-validated score.
best_params_t12 = ad_grid_search_t12.best_params_
best_score_t12 = ad_grid_search_t12.best_score_
print("Best Parameters:", best_params_t12)
print("Best Score:", best_score_t12)

# Evaluation on the held-out rows, scored against this cell's own test labels
# (t12ytest_e) rather than `ytest` from the full-data split.
t12y_pred = ad_grid_search_t12.predict(t12xtest_e)
preE = precision_score(t12ytest_e, t12y_pred)
reE = recall_score(t12ytest_e, t12y_pred)
fscoreE = f1_score(t12ytest_e, t12y_pred)
accuracyE = accuracy_score(t12ytest_e, t12y_pred)
print("Accuracy:", accuracyE)
print("Precision:", preE)
print("Recall:", reE)
print("F1 Score:", fscoreE)

Best Parameters: {'learning_rate': 0.075, 'n_estimators': 100, 'random_state': 20}
Best Score: 0.9037214066234209
Accuracy: 0.9117647058823529
Precision: 0.9210526315789473
Recall: 0.7954545454545454
F1 Score: 0.8536585365853658


top12 serially chi2 90.4%

In [None]:
# AdaBoost on the top-12 chi2 feature subset.
t12columnsl = ['Follicle No. (R)', 'Follicle No. (L)', 'Skin darkening (Y/N)', 'hair growth(Y/N)',
               'Weight gain(Y/N)', 'Weight (Kg)', 'Cycle(R/I)', 'Fast food (Y/N)', 'Pimples(Y/N)',
               'Marraige Status (Yrs)', ' Age (yrs)', 'BMI']
t12data = df[t12columnsl]

# Train-test split of the feature subset (same test size and seed as the full-data split).
t12xtrain, t12xtest, t12ytrain, t12ytest = train_test_split(t12data, y, test_size=.25, random_state=1)

# Grid search over the shared AdaBoost grid `param_grid_ad`. The search is
# fitted once; the second, identical fit that used to follow the prints only
# repeated the whole search on the same data.
ad_clf_t12 = AdaBoostClassifier()
ad_grid_search_t12 = GridSearchCV(ad_clf_t12, param_grid_ad, cv=2, return_train_score=False)
ad_grid_search_t12.fit(t12xtrain, t12ytrain)

# Best parameter combination and its cross-validated score.
best_params_t12 = ad_grid_search_t12.best_params_
best_score_t12 = ad_grid_search_t12.best_score_
print("Best Parameters:", best_params_t12)
print("Best Score:", best_score_t12)

# Evaluation on the held-out rows, scored against this cell's own test labels
# (t12ytest) rather than `ytest` from the full-data split.
t12y_pred = ad_grid_search_t12.predict(t12xtest)
preC = precision_score(t12ytest, t12y_pred)
reC = recall_score(t12ytest, t12y_pred)
fscoreC = f1_score(t12ytest, t12y_pred)
accuracyC = accuracy_score(t12ytest, t12y_pred)
print("Accuracy:", accuracyC)
print("Precision:", preC)
print("Recall:", reC)
print("F1 Score:", fscoreC)

Best Parameters: {'learning_rate': 0.05, 'n_estimators': 100, 'random_state': 20}
Best Score: 0.9111105691849973
Accuracy: 0.9044117647058824
Precision: 0.918918918918919
Recall: 0.7727272727272727
F1 Score: 0.8395061728395061


Top 10 Chi2 with GridSearch 83.8%

In [None]:
# AdaBoost on the top-10 chi2 feature subset.
t10columnsl = ['Skin darkening (Y/N)', 'hair growth(Y/N)', 'Weight gain(Y/N)', 'Weight (Kg)',
               'Cycle(R/I)', 'Fast food (Y/N)', 'Pimples(Y/N)', 'Marraige Status (Yrs)',
               ' Age (yrs)', 'BMI']
t10data = df[t10columnsl]

# Train-test split of the feature subset (same test size and seed as the full-data split).
t10xtrain, t10xtest, t10ytrain, t10ytest = train_test_split(t10data, y, test_size=.25, random_state=1)

# Grid search over the shared AdaBoost grid `param_grid_ad`. The search is
# fitted once; the second, identical fit that used to follow the prints only
# repeated the whole search on the same data.
ad_clf_t10 = AdaBoostClassifier()
ad_grid_search_t10 = GridSearchCV(ad_clf_t10, param_grid_ad, cv=2, return_train_score=False)
ad_grid_search_t10.fit(t10xtrain, t10ytrain)

# Best parameter combination and its cross-validated score.
best_params_t10 = ad_grid_search_t10.best_params_
best_score_t10 = ad_grid_search_t10.best_score_
print("Best Parameters:", best_params_t10)
print("Best Score:", best_score_t10)

# Evaluation on the held-out rows, scored against this cell's own test labels
# (t10ytest) rather than `ytest` from the full-data split.
t10y_pred = ad_grid_search_t10.predict(t10xtest)
preC = precision_score(t10ytest, t10y_pred)
reC = recall_score(t10ytest, t10y_pred)
fscoreC = f1_score(t10ytest, t10y_pred)
accuracyC = accuracy_score(t10ytest, t10y_pred)
print("Accuracy:", accuracyC)
print("Precision:", preC)
print("Recall:", reC)
print("F1 Score:", fscoreC)

Best Parameters: {'learning_rate': 0.075, 'n_estimators': 300, 'random_state': 20}
Best Score: 0.8419621518802126
Accuracy: 0.8382352941176471
Precision: 0.7619047619047619
Recall: 0.7272727272727273
F1 Score: 0.7441860465116279


Top 10 ExtraTree with GridSearch 84.5%

In [None]:
# AdaBoost on the ExtraTree feature subset labelled "top10".
# NOTE(review): this list holds 11 names (it includes 'Avg. F size (L) (mm)') —
# confirm which columns the "top10" experiment is meant to use.
t10columns2 = ['hair growth(Y/N)', 'Skin darkening (Y/N)', 'Weight gain(Y/N)', 'Cycle(R/I)',
               'Fast food (Y/N)', 'Pimples(Y/N)', 'Cycle length(days)', 'Avg. F size (L) (mm)',
               'Hip(inch)', 'Marraige Status (Yrs)', 'Waist:Hip Ratio']
# Build the frame under the name the split below reads. Previously the frame was
# stored as `t10data` while the split used `t10data2`, a variable this cell
# never defined, so the model was trained on whatever an earlier cell left there.
t10data2 = df[t10columns2]

# Train-test split of the feature subset (same test size and seed as the full-data split).
t10xtrain_e, t10xtest_e, t10ytrain_e, t10ytest_e = train_test_split(t10data2, y, test_size=.25, random_state=1)

# Grid search over the shared AdaBoost grid `param_grid_ad`.
ad_clf_t10 = AdaBoostClassifier()
ad_grid_search_t10 = GridSearchCV(ad_clf_t10, param_grid_ad, cv=2, return_train_score=False)
ad_grid_search_t10.fit(t10xtrain_e, t10ytrain_e)

# Best parameter combination and its cross-validated score.
best_params_t10 = ad_grid_search_t10.best_params_
best_score_t10 = ad_grid_search_t10.best_score_
print("Best Parameters:", best_params_t10)
print("Best Score:", best_score_t10)

# Evaluation on the held-out rows, scored against this cell's own test labels
# (t10ytest_e) rather than `ytest` from the full-data split.
t10y_pred = ad_grid_search_t10.predict(t10xtest_e)
preE = precision_score(t10ytest_e, t10y_pred)
reE = recall_score(t10ytest_e, t10y_pred)
fscoreE = f1_score(t10ytest_e, t10y_pred)
accuracyE10 = accuracy_score(t10ytest_e, t10y_pred)
print("Accuracy:", accuracyE10)
print("Precision:", preE)
print("Recall:", reE)
print("F1 Score:", fscoreE)



Best Parameters: {'learning_rate': 0.075, 'n_estimators': 100, 'random_state': 20}
Best Score: 0.8444617860800858
Accuracy: 0.8455882352941176
Precision: 0.7804878048780488
Recall: 0.7272727272727273
F1 Score: 0.7529411764705882


## SVC

In [None]:
from sklearn.svm import SVC

# Search space for the support-vector classifier.
# NOTE(review): gamma is searched for both kernels; per the scikit-learn docs it
# is a coefficient for the rbf/poly/sigmoid kernels — confirm the linear-kernel
# combinations are wanted.
param_grid_svc = dict(
    C=[0.1, 1, 10],
    kernel=['linear', 'rbf'],
    gamma=[1, 0.1, 0.01],
)
svc1 = SVC()

# 5-fold cross-validated exhaustive search; fitted in the next cell.
svc_grid_search = GridSearchCV(svc1, param_grid_svc, cv=5, return_train_score=False)

In [None]:
# Run the SVC search on the full-feature training split.
svc_grid_search.fit(xtrain, ytrain)

# Report the winning parameter combination and its cross-validated score.
best_params, best_score = svc_grid_search.best_params_, svc_grid_search.best_score_
print("Best Parameters:", best_params)
print("Best Score:", best_score)

In [None]:
# SVC with hand-set parameters, trained and scored on the full-feature split.
# NOTE(review): these values are written out by hand rather than read from
# svc_grid_search.best_params_ — confirm they are the intended settings.
svc2 = SVC(C=1, kernel='linear', gamma=0.1)
svc2.fit(xtrain, ytrain)
model = svc2  # fit() returns the estimator itself; the cells below refit this object
ypred = model.predict(xtest)
print("Accuracy:", metrics.accuracy_score(ytest, ypred))

Accuracy: 0.8455882352941176


In [None]:
# Top-12 ExtraTree feature subset, scored with the estimator bound to `model`
# (the SVC from the preceding cell when the notebook is run top to bottom).
# No grid search is run here; `model` is simply refitted on the subset.
t5columnsl = ['Follicle No. (R)', 'Follicle No. (L)', 'hair growth(Y/N)', 'Skin darkening (Y/N)',
              'Weight gain(Y/N)', 'Cycle(R/I)', 'Fast food (Y/N)', 'Pimples(Y/N)',
              'Cycle length(days)', 'Avg. F size (L) (mm)', 'Hip(inch)', 'Marraige Status (Yrs)']
t5data = df[t5columnsl]

# Same test size and seed as the full-data split.
t5xtrain, t5xtest, t5ytrain, t5ytest = train_test_split(t5data, y, test_size=.25, random_state=1)
model.fit(t5xtrain, t5ytrain)
t5y_pred = model.predict(t5xtest)

# All metrics are scored against this cell's own test labels (t5ytest) rather
# than `ytest` from the full-data split.
preE = precision_score(t5ytest, t5y_pred)
reE = recall_score(t5ytest, t5y_pred)
fscoreE = f1_score(t5ytest, t5y_pred)
accuracyE = accuracy_score(t5ytest, t5y_pred)
print("Accuracy:", accuracyE)
print("Precision:", preE)
print("Recall:", reE)
print("F1 Score:", fscoreE)

Accuracy: 0.9044117647058824
Precision: 0.8974358974358975
Recall: 0.7954545454545454
F1 Score: 0.8433734939759037


In [None]:
# Top-12 chi2 feature subset, scored with the estimator bound to `model`
# (the SVC from the earlier cell when the notebook is run top to bottom).
# No grid search is run here; `model` is simply refitted on the subset.
t5columnsl = ['Follicle No. (R)', 'Follicle No. (L)', 'Skin darkening (Y/N)', 'hair growth(Y/N)',
              'Weight gain(Y/N)', 'Weight (Kg)', 'Cycle(R/I)', 'Fast food (Y/N)', 'Pimples(Y/N)',
              'Marraige Status (Yrs)', ' Age (yrs)', 'BMI']
t5data = df[t5columnsl]

# Same test size and seed as the full-data split.
t5xtrain, t5xtest, t5ytrain, t5ytest = train_test_split(t5data, y, test_size=.25, random_state=1)
model.fit(t5xtrain, t5ytrain)
t5y_pred = model.predict(t5xtest)

# All metrics are scored against this cell's own test labels (t5ytest) rather
# than `ytest` from the full-data split.
preC = precision_score(t5ytest, t5y_pred)
reC = recall_score(t5ytest, t5y_pred)
fscoreC = f1_score(t5ytest, t5y_pred)
accuracyC = accuracy_score(t5ytest, t5y_pred)
print("Accuracy:", accuracyC)
print("Precision:", preC)
print("Recall:", reC)
print("F1 Score:", fscoreC)

Accuracy: 0.9338235294117647
Precision: 0.9487179487179487
Recall: 0.8409090909090909
F1 Score: 0.891566265060241


In [None]:
# Top-10 chi2 feature subset, scored with the estimator bound to `model`
# (the SVC from the earlier cell when the notebook is run top to bottom).
# No grid search is run here; `model` is simply refitted on the subset.
t5columnsl = ['Skin darkening (Y/N)', 'hair growth(Y/N)', 'Weight gain(Y/N)', 'Weight (Kg)',
              'Cycle(R/I)', 'Fast food (Y/N)', 'Pimples(Y/N)', 'Marraige Status (Yrs)',
              ' Age (yrs)', 'BMI']
t5data = df[t5columnsl]

# Same test size and seed as the full-data split.
t5xtrain, t5xtest, t5ytrain, t5ytest = train_test_split(t5data, y, test_size=.25, random_state=1)
model.fit(t5xtrain, t5ytrain)
t5y_pred = model.predict(t5xtest)

# All metrics are scored against this cell's own test labels (t5ytest) rather
# than `ytest` from the full-data split.
preC = precision_score(t5ytest, t5y_pred)
reC = recall_score(t5ytest, t5y_pred)
fscoreC = f1_score(t5ytest, t5y_pred)
accuracyC = accuracy_score(t5ytest, t5y_pred)
print("Accuracy:", accuracyC)
print("Precision:", preC)
print("Recall:", reC)
print("F1 Score:", fscoreC)

Accuracy: 0.8455882352941176
Precision: 0.7446808510638298
Recall: 0.7954545454545454
F1 Score: 0.7692307692307692


In [None]:
# ExtraTree feature subset labelled "top10", scored with the estimator bound to
# `model` (the SVC from the earlier cell when the notebook is run top to bottom).
# No grid search is run here; `model` is simply refitted on the subset.
# NOTE(review): this list holds 11 names (it includes 'Avg. F size (L) (mm)') —
# confirm which columns the "top10" experiment is meant to use.
t5columnsl = ['hair growth(Y/N)', 'Skin darkening (Y/N)', 'Weight gain(Y/N)', 'Cycle(R/I)',
              'Fast food (Y/N)', 'Pimples(Y/N)', 'Cycle length(days)', 'Avg. F size (L) (mm)',
              'Hip(inch)', 'Marraige Status (Yrs)', 'Waist:Hip Ratio']
t5data = df[t5columnsl]

# Same test size and seed as the full-data split.
t5xtrain, t5xtest, t5ytrain, t5ytest = train_test_split(t5data, y, test_size=.25, random_state=1)
model.fit(t5xtrain, t5ytrain)
t5y_pred = model.predict(t5xtest)

# All metrics are scored against this cell's own test labels (t5ytest) rather
# than `ytest` from the full-data split.
preE = precision_score(t5ytest, t5y_pred)
reE = recall_score(t5ytest, t5y_pred)
fscoreE = f1_score(t5ytest, t5y_pred)
accuracyE = accuracy_score(t5ytest, t5y_pred)
print("Accuracy:", accuracyE)
print("Precision:", preE)
print("Recall:", reE)
print("F1 Score:", fscoreE)

Accuracy: 0.8529411764705882
Precision: 0.7727272727272727
Recall: 0.7727272727272727
F1 Score: 0.7727272727272727


In [None]:
# Nine non-invasive features common to the chi2 and extra-tree rankings, scored
# with the estimator bound to `model` (the SVC from the earlier cell when the
# notebook is run top to bottom). No grid search is run here; `model` is simply
# refitted on the subset.
t5columnsl = ['Skin darkening (Y/N)', 'hair growth(Y/N)', 'Weight gain(Y/N)', 'Weight (Kg)',
              'Cycle(R/I)', 'Fast food (Y/N)', 'Pimples(Y/N)', 'Marraige Status (Yrs)',
              ' Age (yrs)']
t5data = df[t5columnsl]

# Same test size and seed as the full-data split.
t5xtrain, t5xtest, t5ytrain, t5ytest = train_test_split(t5data, y, test_size=.25, random_state=1)
model.fit(t5xtrain, t5ytrain)
t5y_pred = model.predict(t5xtest)

# All metrics are scored against this cell's own test labels (t5ytest) rather
# than `ytest` from the full-data split.
preC = precision_score(t5ytest, t5y_pred)
reC = recall_score(t5ytest, t5y_pred)
fscoreC = f1_score(t5ytest, t5y_pred)
accuracyC10 = accuracy_score(t5ytest, t5y_pred)
print("Accuracy:", accuracyC10)
print("Precision:", preC)
print("Recall:", reC)
print("F1 Score:", fscoreC)


Accuracy: 0.8455882352941176
Precision: 0.7446808510638298
Recall: 0.7954545454545454
F1 Score: 0.7692307692307692
