In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from Bagging import Custom_Bagging
from Boosting import Custom_Boosting
from Random_forest import Custom_RandomForest
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score



In [21]:
# Load the raw table and discard its first column by position.
# NOTE(review): a later cell drops 'id' by name, so the column removed here is
# presumably an unnamed index column rather than 'id' -- confirm against the CSV.
data = pd.read_csv('your_data.csv').iloc[:, 1:]

# 'smoking' is the target; every remaining column is a feature.
y = data['smoking']
X = data.drop(columns=['smoking'])

# 70% train; the held-out 30% is then halved into test and validation.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42
)
X_test, X_val, y_test, y_val = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [None]:
# Columns to inspect one at a time (histogram + boxplot each).
column_name = [
    'eyesight(left)',
    'height(cm)',
    'hemoglobin',
    'HDL',
    'serum creatinine',
    'hearing(left)',
    'waist(cm)',
    'fasting blood sugar',
    'eyesight(right)',
]

for column in column_name:
    # Columns missing from the loaded frame are skipped without a message.
    if column not in data.columns:
        continue

    # Histogram with a KDE overlay
    plt.figure(figsize=(8, 4))
    hist_ax = sns.histplot(data[column], kde=True)
    hist_ax.set_title(f'Distribution of {column}')
    hist_ax.set_xlabel(column)
    plt.show()

    # Boxplot (drawn on a fresh default-sized figure)
    box_ax = sns.boxplot(x=data[column], color='lightblue')
    box_ax.set_title(f'Boxplot of {column}')
    plt.show()

# Summary statistics for the whole frame
print(data.describe())

In [None]:
print("Bivariate Analysis")

# Numeric feature names come from the training split; the plots use the full frame.
numerical_columns = X_train.select_dtypes(include=['int64', 'float64']).columns
numeric_view = data[numerical_columns]

# Pairwise correlations as an annotated heatmap
plt.figure(figsize=(12, 8))
corr_matrix = numeric_view.corr()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title('Correlation Matrix')
plt.show()

# Scatter matrix of every numeric pair
sns.pairplot(numeric_view)
plt.show()



In [None]:
# PCA example for dimensionality reduction
from sklearn.preprocessing import StandardScaler

# Standardize the numeric training features before PCA
scaler = StandardScaler()
scaled_data = scaler.fit_transform(X_train[numerical_columns])

# Project onto the first two principal components
pca = PCA(n_components=2)
pca_result = pca.fit_transform(scaled_data)

# Scatter of the two component scores
first_pc = pca_result[:, 0]
second_pc = pca_result[:, 1]
plt.scatter(first_pc, second_pc, alpha=0.7, color='purple')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Features')
plt.show()



In [25]:
# Drop irrelevant features.
# The same columns are removed from every split (train, test AND validation)
# so that all three keep an identical feature schema; previously the
# validation split was left with the extra columns.
columns_to_drop = ['eyesight(left)', 'eyesight(right)', 'hearing(left)', 'id']
X_train = X_train.drop(columns=columns_to_drop)
X_test = X_test.drop(columns=columns_to_drop)
X_val = X_val.drop(columns=columns_to_drop)

In [None]:
# Per-column count of missing entries in the training features
missing_values = X_train.isna().sum()
print("Missing Values in Each Column:", missing_values, sep="\n")


In [None]:
# Tukey fences: anything beyond 1.5 * IQR from the quartiles counts as an outlier
Q1, Q3 = X_train.quantile(0.25), X_train.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Boolean mask, same shape as X_train, True where a cell lies outside the fences
outliers = X_train.lt(lower_fence) | X_train.gt(upper_fence)

# Keep outlier cells, blank out (NaN) everything else
outlier_values = X_train.where(outliers)

# Rows with no outlier in any column are entirely NaN; remove them
outlier_values_clean = outlier_values.dropna(how='all')
print("Clean Outliers (Non-NaN rows):", outlier_values_clean, sep="\n")

In [None]:
def count_outliers(series):
    """Return how many values of ``series`` fall outside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` where Q1/Q3 are
    the 25th/75th percentiles of ``series``.
    """
    lower_q, upper_q = series.quantile(0.25), series.quantile(0.75)
    spread = upper_q - lower_q
    too_low = series < (lower_q - 1.5 * spread)
    too_high = series > (upper_q + 1.5 * spread)
    return (too_low | too_high).sum()

# Apply the IQR outlier counter to every training column and report the tallies
outliers_count = X_train.apply(count_outliers)
print("Outliers Count per Column:", outliers_count, sep="\n")

In [None]:
import numpy as np
# Define a function to handle outliers in each column
def winsorize(X_train, feature, lower_percentile=5, upper_percentile=95):
    """Cap one column of ``X_train`` at the given percentiles, in place.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Frame to modify; ``X_train[feature]`` is overwritten with the
        capped values.
    feature : str
        Name of the column to winsorize.
    lower_percentile, upper_percentile : float
        Percentiles (0-100) of ``X_train[feature]`` used as the lower and
        upper caps.

    Returns
    -------
    None
        The frame is modified in place.
    """
    lower_bound = np.percentile(X_train[feature], lower_percentile)
    upper_bound = np.percentile(X_train[feature], upper_percentile)

    # Cap the column of the frame that was passed in. This used to clip the
    # notebook-global `data[feature]`, which only produced the right rows via
    # index alignment and broke for any frame not derived from `data`.
    X_train[feature] = X_train[feature].clip(lower=lower_bound, upper=upper_bound)

# Continuous measurements to cap at the default 5th/95th percentiles.
# (The variable name is kept as spelled because it is notebook-level state.)
feautues = [
    'height(cm)',
    'hemoglobin',
    'HDL',
    'serum creatinine',
    'waist(cm)',
    'fasting blood sugar',
]
for feature in feautues:
    winsorize(X_train, feature)

# Show the training features after capping
print("Dataset after Winsorization:", X_train, sep="\n")


In [None]:
# Re-count IQR outliers per column now that the features have been winsorized.
# `count_outliers` is already defined in the earlier outlier-count cell; the
# identical second definition that used to live here was removed so there is
# a single source of truth for the helper.
outliers_count = X_train.apply(count_outliers)
print("Outliers Count per Column:")
print(outliers_count)

In [None]:
# Z-Score Normalization
def z_score_normalize(column):
    """Standardize a column to zero mean and unit (sample) standard deviation.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to standardize.

    Returns
    -------
    pandas.Series
        ``(column - mean) / std`` using pandas' default sample std. A constant
        column (std == 0) is returned as all zeros instead of NaN.
    """
    centered = column - column.mean()
    std = column.std()
    if std == 0:
        # Constant column: 0/0 would give NaN for every row; the centered
        # values are already exactly zero, so return them unscaled.
        return centered
    return centered / std

# Standardize every training column independently and show the result
normailzed_data_z_score = X_train.apply(z_score_normalize)

print("Z-Score Normalized Data:", normailzed_data_z_score, sep="\n")


In [None]:
# Min-Max Normalization
def min_max_normalize(column):
    """Rescale a column linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to rescale.

    Returns
    -------
    pandas.Series
        ``(column - min) / (max - min)``. A constant column (max == min) is
        returned as all zeros instead of NaN.
    """
    low = column.min()
    span = column.max() - low
    shifted = column - low
    if span == 0:
        # Constant column: avoid 0/0 -> NaN, which downstream feature
        # scoring cannot handle. Cast so the dtype matches the normal path.
        return shifted.astype(float)
    return shifted / span

# Rescale every training column to the [0, 1] range and show the result
data_min_max = X_train.apply(min_max_normalize)

print("Min-Max Normalized Data:", data_min_max, sep="\n")

In [None]:
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif


# Fit the ANOVA F-test selector on the TRAINING split only. The same fitted
# selector is then reused for the test split; fitting a second selector on
# (X_test, y_test) would leak test labels and could pick different columns
# than the ones the models are trained on.
selector = SelectKBest(score_func=f_classif, k=5)
X_new = selector.fit_transform(data_min_max, y_train)

# The training features were min-max scaled before selection, so the test
# features must be put on the same scale using the TRAINING min/range.
# Zero ranges are replaced by 1 to avoid dividing by zero on constant columns.
# Test values outside the training range will fall outside [0, 1]; that is
# expected and left untouched.
train_min = X_train.min()
train_range = (X_train.max() - train_min).replace(0, 1)
X_test_scaled = (X_test[X_train.columns] - train_min) / train_range
x_new_test = selector.transform(X_test_scaled)

# Report which columns were actually kept instead of relying on a note.
selected_features = list(data_min_max.columns[selector.get_support()])
print("Top 5 Features based on ANOVA F-test:")
print(selected_features)
print(X_new)

# Previously recorded selection (verify against the printed list above):
# height(cm), hemoglobin, HDL, serum creatinine, waist(cm)

In [40]:
# Train and score three ensemble models on the k-best-selected features.
# Custom_Bagging / Custom_Boosting / Custom_RandomForest come from the local
# Bagging, Boosting and Random_forest modules imported at the top.
# NOTE(review): the recorded output of this cell is
#   TypeError: Cannot cast array data from dtype('float64') to dtype('int64')
#   according to the rule 'safe'
# The failing call is not visible from this notebook; presumably it is raised
# inside one of the custom ensemble classes -- confirm with the full traceback.

# The base estimator is passed as a class, not an instance.
# NOTE(review): `n` is presumably the number of base estimators -- confirm.
bagging_model = Custom_Bagging(DecisionTreeClassifier, n=50)
boosting_model = Custom_Boosting(DecisionTreeClassifier, n=50)
# Convert everything to NumPy arrays before handing it to the custom models.
X_new=np.array(X_new)
y_train=np.array(y_train)
# X_test is rebound here to the feature-selected test array (x_new_test),
# so after this cell it no longer refers to the original test DataFrame.
X_test=np.array(x_new_test)
y_test=np.array(y_test)
# The validation arrays are converted but not used anywhere in this cell.
y_val=np.array(y_val)
X_val=np.array(X_val)
# Fit both ensembles on the selected training features.
bagging_model.fit(X_new, y_train)
boosting_model.fit(X_new, y_train)
# Predict on the selected test features and compare against the test labels.
bagging_predictions = bagging_model.predict(X_test)
boosting_predictions = boosting_model.predict(X_test)
bagging_accuracy = accuracy_score(y_test, bagging_predictions)
boosting_accuracy = accuracy_score(y_test, boosting_predictions)
print(f"Bagging Model Accuracy: {bagging_accuracy:.4f}")
print(f"Boosting Model Accuracy: {boosting_accuracy:.4f}")
# Same train/predict/score sequence for the custom random forest.
rf_classifier = Custom_RandomForest(n_estimators=100, max_features='sqrt', random_state=42)
rf_classifier.fit(X_new, y_train)
rf_predictions = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")


TypeError: Cannot cast array data from dtype('float64') to dtype('int64') according to the rule 'safe'