# Modeling

### import libraries 

In [1]:
# Import pandas with alias pd
import pandas as pd

# Import train_test_split for data splitting
from sklearn.model_selection import train_test_split, GridSearchCV

# Import Logistic Regression model
from sklearn.neighbors import KNeighborsClassifier

# Import evaluation metrics for model performance
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


### read data

In [2]:
# Read the CSV file into a DataFrame using pandas
data = pd.read_csv(r"C:\Users\moham\OneDrive\Documents\my track\Internships\Cellula\Modeling\data.csv")


In [3]:
data.select_dtypes('object').columns

Index([], dtype='object')

### spliting data to ( X_train , X_test , Y_train , Y_test )

In [4]:
# Extract features (X) by dropping the 'output' column
X = data.drop('output', axis=1)

# Extract target variable (Y) from the 'output' column
Y = data['output']


In [5]:
X_train , X_test , Y_train , Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=42)

### The model


In [6]:
# Create a Logistic Regression model
model = KNeighborsClassifier()

In [7]:
# Train K-Nearest Neighbors with Grid Search

# Define hyperparameter grid for Grid Search
param_grid_knn = {'n_neighbors': [3, 5, 7, 10], 'weights': ['uniform', 'distance'], 'p': [1, 2]}

# Set up Grid Search for K-Nearest Neighbors
grid_search_knn = GridSearchCV(estimator=model, param_grid=param_grid_knn, scoring='accuracy', cv=5)

# Fit Grid Search to training data
grid_search_knn.fit(X_train, Y_train)


### Model Evaluation 

In [9]:
# Make predictions on the training set using the trained model
X_train_prediction = grid_search_knn.predict(X_train)

# Calculate and print the accuracy on the training data
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print(f'Training Data Accuracy: {training_data_accuracy:.2%}')

Training Data Accuracy: 99.34%


In [8]:
# Print the best parameters
print("Best Parameters:", grid_search_knn.best_params_)

# Make predictions using the best model
Y_test_pred = grid_search_knn.predict(X_test)

Best Parameters: {'n_neighbors': 10, 'p': 1, 'weights': 'distance'}


In [10]:
# Evaluate the model performance on the test set
test_accuracy = accuracy_score(Y_test_pred, Y_test)
print(f'Test Data Accuracy: {test_accuracy:.2%}')

# Print the classification report
print('Classification Report:\n', classification_report(Y_test_pred, Y_test))

# Print the confusion matrix
print('Confusion Matrix:\n', confusion_matrix(Y_test_pred, Y_test))

Test Data Accuracy: 87.11%
Classification Report:
               precision    recall  f1-score   support

         0.0       0.75      0.80      0.77      1556
         1.0       0.92      0.90      0.91      4129

    accuracy                           0.87      5685
   macro avg       0.83      0.85      0.84      5685
weighted avg       0.87      0.87      0.87      5685

Confusion Matrix:
 [[1244  312]
 [ 421 3708]]
