In [2]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Data preprocessing
X_train = train.drop(columns=['smoking'])  # Exclude the target variable
y_train = train['smoking']

X_test = test  # No need to drop 'smoking' column for test data

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model training using Logistic Regression
logistic_model = LogisticRegression()
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validation to evaluate the model
y_pred_train = cross_val_predict(logistic_model, X_train_scaled, y_train, cv=cv)  # Predict on training data
accuracy_train = accuracy_score(y_train, y_pred_train)
print("Classification Report on Training Data:")
print(f"Accuracy on Training Data: {accuracy_train}")

# Train the final model on the entire training dataset
logistic_model.fit(X_train_scaled, y_train)

# Predict smoking probabilities on the test data
y_pred_proba_test = logistic_model.predict_proba(X_test_scaled)[:, 1]  # Probability of class 1

# Save predictions to a CSV file for test data
submission = test[['id']].copy()
submission['smoking'] = y_pred_proba_test
submission.to_csv('Yaga\'s_submission.csv', index=False)


Classification Report on Training Data:
Accuracy on Training Data: 0.748587180388808


In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Data preprocessing
X_train = train.drop(columns=['smoking'])  # Exclude the target variable
y_train = train['smoking']

X_test = test.drop(columns=['smoking'], errors='ignore')  # In case 'smoking' column is present

# Handling missing values
imputer = SimpleImputer(strategy='mean')  # or median, most_frequent
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Model training using Random Forest Classifier
random_forest_model = RandomForestClassifier(random_state=42)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validation to evaluate the model
y_pred_train = cross_val_predict(random_forest_model, X_train_scaled, y_train, cv=cv)  # Predict on training data
accuracy_train = accuracy_score(y_train, y_pred_train)
print(f"Accuracy on Training Data: {accuracy_train}")

# Train the final model on the entire training dataset
random_forest_model.fit(X_train_scaled, y_train)

# Predict smoking probabilities on the test data
y_pred_proba_test = random_forest_model.predict_proba(X_test_scaled)[:, 1]  # Probability of class 1

# Save predictions to a CSV file for test data
submission = pd.DataFrame({'id': test['id'], 'smoking': y_pred_proba_test})
submission.to_csv('Yagas_submission.csv', index=False)


In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_predict, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Data preprocessing
X_train = train.drop(columns=['smoking'])  # Exclude the target variable
y_train = train['smoking']

X_test = test.drop(columns=['smoking'], errors='ignore')  # In case 'smoking' column is present

# Handling missing values
imputer = SimpleImputer(strategy='mean')  # or median, most_frequent
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Advanced Feature Engineering: Adding polynomial features
poly = PolynomialFeatures(degree=2, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

# Model training using Random Forest Classifier with Hyperparameter Tuning
random_forest_model = RandomForestClassifier(random_state=42)

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=random_forest_model, param_grid=param_grid, cv=cv, n_jobs=-1, verbose=2, scoring='accuracy')

grid_search.fit(X_train_poly, y_train)
best_rf_model = grid_search.best_estimator_

# Evaluate the best model
y_pred_train = cross_val_predict(best_rf_model, X_train_poly, y_train, cv=cv)
accuracy_train = accuracy_score(y_train, y_pred_train)

print(f"Best Model Accuracy on Training Data: {accuracy_train}")

# Train the final model on the entire training dataset
best_rf_model.fit(X_train_poly, y_train)

# Predict smoking probabilities on the test data
y_pred_proba_test = best_rf_model.predict_proba(X_test_poly)[:, 1]  # Probability of class 1

# Save predictions to a CSV file for test data
submission = pd.DataFrame({'id': test['id'], 'smoking': y_pred_proba_test})
submission.to_csv('Yagas_submission.csv', index=False)


In [None]:
# Takes a lot of time  !!



import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report

# Load the data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Basic Feature Engineering
# Example: Creating a new feature like BMI
train['BMI'] = train['weight(kg)'] / (train['height(cm)']/100)**2
test['BMI'] = test['weight(kg)'] / (test['height(cm)']/100)**2

# Data preprocessing
X = train.drop(columns=['smoking'])  # Exclude the target variable
y = train['smoking']
X_test = test

# Scaling features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Splitting the data for model evaluation
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, stratify=y, random_state=42)

# Model Selection - Random Forest Classifier
rf = RandomForestClassifier(random_state=42)

# Hyperparameter Grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid Search with Cross-Validation
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
grid_search = GridSearchCV(rf, param_grid, cv=cv, scoring='roc_auc', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best model
best_rf_model = grid_search.best_estimator_
print("Best Random Forest Model:", best_rf_model)

# Evaluate on validation data
y_val_pred = best_rf_model.predict(X_val)
accuracy_val = accuracy_score(y_val, y_val_pred)
roc_auc_val = roc_auc_score(y_val, y_val_pred)
print("Validation Data Evaluation:")
print(f"Accuracy: {accuracy_val}")
print(f"ROC AUC: {roc_auc_val}")
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Classification Report:\n", classification_report(y_val, y_val_pred))

# Predict probabilities on the test data
y_pred_proba_test = best_rf_model.predict_proba(X_test_scaled)[:, 1]

# Submission file
submission = test[['id']].copy()
submission['smoking'] = y_pred_proba_test
submission.to_csv('submission.csv', index=False)
