In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Load datasets from a folder
def load_and_print_datasets_from_folder(folder_path):
    """Load every CSV file found directly in ``folder_path`` and preview it.

    For each file a header line, the first rows (``head()``) and the
    ``info()`` summary are written to stdout.

    Parameters
    ----------
    folder_path : str or path-like
        Directory to scan. Sub-directories are not searched.

    Returns
    -------
    dict
        Maps each file name without its extension to the loaded DataFrame,
        inserted in sorted file-name order.

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when the folder cannot be listed.
    """
    datasets = {}
    # os.listdir gives no ordering guarantee; sort so every run processes and
    # prints the datasets in the same order.
    for filename in sorted(os.listdir(folder_path)):
        # Case-insensitive match so files such as 'DATA.CSV' are not skipped.
        if not filename.lower().endswith('.csv'):
            continue
        dataset_name = os.path.splitext(filename)[0]
        df = pd.read_csv(os.path.join(folder_path, filename))
        datasets[dataset_name] = df

        print(f"\nDataset: {dataset_name}")
        print(df.head())  # first few rows
        # info() writes its report itself and returns None, so wrapping it in
        # print() would emit an extra "None" line after the summary.
        df.info()
    return datasets

In [3]:
def create_pipeline(categorical_features, numerical_features):
    """Assemble the preprocessing + Random Forest pipeline.

    Numerical columns are median-imputed and then standardised. Categorical
    columns have missing values replaced by the literal ``'missing'`` and are
    then one-hot encoded with ``handle_unknown='ignore'``.

    Parameters
    ----------
    categorical_features : list
        Column names routed through the categorical branch.
    numerical_features : list
        Column names routed through the numerical branch.

    Returns
    -------
    Pipeline
        Unfitted pipeline with the steps ``'preprocessor'`` and
        ``'classifier'`` (100 trees, ``random_state=42``).
    """
    # One (name, transformer, columns) entry per column family.
    column_branches = [
        (
            'num',
            Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='median')),
                ('scaler', StandardScaler()),
            ]),
            numerical_features,
        ),
        (
            'cat',
            Pipeline(steps=[
                ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                ('onehot', OneHotEncoder(handle_unknown='ignore')),
            ]),
            categorical_features,
        ),
    ]

    forest = RandomForestClassifier(n_estimators=100, random_state=42)

    return Pipeline(steps=[
        ('preprocessor', ColumnTransformer(transformers=column_branches)),
        ('classifier', forest),
    ])

In [5]:
def train_and_evaluate(pipeline, df, target_col='churn', test_size=0.3, random_state=42):
    """Fit ``pipeline`` on a train split of ``df`` and score it on the held-out rows.

    Parameters
    ----------
    pipeline : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``.
    df : pandas.DataFrame
        Feature columns plus the target column.
    target_col : str, default 'churn'
        Name of the label column.
    test_size : float, default 0.3
        Passed to ``train_test_split`` as the held-out share.
    random_state : int, default 42
        Seed passed to ``train_test_split``.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'auc_roc'``.

    Raises
    ------
    ValueError
        If ``target_col`` is not a column of ``df``.
    """
    # Fail early with a clear message instead of a KeyError further down.
    if target_col not in df.columns:
        raise ValueError(f"'{target_col}' column not found in the dataset")

    X = df.drop(columns=[target_col])
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    # Column 1 is used as the positive-class score; this assumes a binary
    # target whose positive label is the second class of the classifier.
    positive_scores = pipeline.predict_proba(X_test)[:, 1]

    return {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'auc_roc': roc_auc_score(y_test, positive_scores),
    }

def _split_feature_columns(df, target='churn'):
    """Return ``(categorical, numerical)`` feature-name lists for ``df``.

    Numeric columns of any width (int32, float32, ...) count as numerical;
    object and string columns count as categorical. Boolean columns and any
    other dtype are placed in neither list, and ``target`` is always excluded.
    """
    categorical, numerical = [], []
    for col in df.columns:
        if col == target:
            continue
        column = df[col]
        # is_numeric_dtype is also True for bool, so rule that out first.
        if pd.api.types.is_bool_dtype(column):
            continue
        if pd.api.types.is_numeric_dtype(column):
            numerical.append(col)
        elif pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
            categorical.append(col)
    return categorical, numerical


# Initialize the results list
results_list = []

# Load datasets. Set the CHURN_DATA_DIR environment variable to point at your
# own folder; the original location is kept as the fallback.
folder_path = os.environ.get('CHURN_DATA_DIR', r'E:\TASK2\churn_data')
datasets = load_and_print_datasets_from_folder(folder_path)

for dataset_name, df in datasets.items():
    print(f"\nProcessing dataset: {dataset_name}")

    try:
        # Columns matched by neither list are not passed to any transformer,
        # so the dtype test has to be broad enough not to lose features.
        categorical_features, numerical_features = _split_feature_columns(df)

        ignored_columns = [
            col for col in df.columns
            if col != 'churn'
            and col not in categorical_features
            and col not in numerical_features
        ]
        if ignored_columns:
            print(f"Columns left out of the model (unsupported dtype): {ignored_columns}")

        # Create the pipeline
        pipeline = create_pipeline(categorical_features, numerical_features)

        # Train and evaluate
        evaluation_results = train_and_evaluate(pipeline, df)

        # Collect results: one flat record per dataset
        results_list.append({'dataset_name': dataset_name, **evaluation_results})

        print(f"Evaluation results for {dataset_name}:")
        print(evaluation_results)
    except ValueError as e:
        # Expected failures, e.g. a dataset without a 'churn' column.
        print(f"Error for dataset {dataset_name}: {e}")
    except Exception as e:
        # Best effort: one bad dataset must not stop the remaining ones.
        print(f"Unexpected error for dataset {dataset_name}: {e}")

# Convert results list to DataFrame
results_df = pd.DataFrame(results_list)




Dataset: BankChurners
   clientnum  churn  customer_age gender  dependent_count education_level  \
0  768805383      0            45      M                3     High School   
1  818770008      0            49      F                5        Graduate   
2  713982108      0            51      M                3        Graduate   
3  769911858      0            40      F                4     High School   
4  709106358      0            40      M                3      Uneducated   

  marital_status income_category card_category  months_on_book  ...  \
0        Married     $60K - $80K          Blue              39  ...   
1         Single  Less than $40K          Blue              44  ...   
2        Married    $80K - $120K          Blue              36  ...   
3        Unknown  Less than $40K          Blue              34  ...   
4        Married     $60K - $80K          Blue              21  ...   

   credit_limit  total_revolving_bal  avg_open_to_buy  total_amt_chng_q4_q1  \
0       

In [None]:
# Show the collected per-dataset metrics under a heading
print("\nAll Evaluation Results:", results_df, sep="\n")


In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Locations used by the EDA cell: CSV inputs are read from `data_dir`, and
# every generated table and figure is written under `results_dir`.
data_dir, results_dir = 'churn_data', 'eda_results'
os.makedirs(results_dir, exist_ok=True)  # no error if the folder already exists

# Path separators and characters that are not allowed in Windows file names.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})


def _safe_filename_part(name):
    """Return ``name`` as text that can be embedded in a file name."""
    return str(name).translate(_UNSAFE_FILENAME_CHARS)


def perform_eda(df, dataset_name, output_dir=None):
    """Write summary tables and distribution plots for one dataset to disk.

    Files are named ``<dataset_name>_<what>.csv`` or ``.png``. The function
    writes summary statistics, dtypes, missing-value counts, the ``churn``
    distribution (when that column exists), value counts per categorical
    column, a histogram and a box plot per numerical column, and a
    correlation heatmap of the numerical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to describe.
    dataset_name : str
        Prefix of every generated file name.
    output_dir : str or path-like, optional
        Destination folder. Defaults to the module-level ``results_dir``.
        It is created if it does not exist.

    Returns
    -------
    None
    """
    if output_dir is None:
        output_dir = results_dir
    os.makedirs(output_dir, exist_ok=True)

    def out_path(suffix):
        return os.path.join(output_dir, f'{dataset_name}_{suffix}')

    # Summary statistics, data types and missing values
    df.describe(include='all').transpose().to_csv(out_path('summary_stats.csv'))
    df.dtypes.to_csv(out_path('data_types.csv'))
    df.isnull().sum().to_csv(out_path('missing_values.csv'))

    # Distribution of target variable
    if 'churn' in df.columns:
        df['churn'].value_counts(normalize=True).to_csv(out_path('churn_distribution.csv'))

    # Categorical features: object columns and dedicated string dtypes alike
    categorical_features = [
        col for col in df.columns
        if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col])
    ]
    for cat_feature in categorical_features:
        if cat_feature == 'churn':
            continue
        # Column names may contain characters that are invalid in a path.
        safe_name = _safe_filename_part(cat_feature)
        df[cat_feature].value_counts().to_csv(out_path(f'{safe_name}_distribution.csv'))

    # Numerical features: every numeric width, not only int64/float64
    numerical_features = df.select_dtypes(include='number').columns
    for num_feature in numerical_features:
        safe_name = _safe_filename_part(num_feature)

        fig, ax = plt.subplots(figsize=(10, 6))
        sns.histplot(df[num_feature], kde=True, ax=ax)
        ax.set_title(f'Distribution of {num_feature}')
        ax.set_xlabel(num_feature)
        ax.set_ylabel('Frequency')
        fig.savefig(out_path(f'{safe_name}_distribution.png'))
        plt.close(fig)  # release the figure; many are created in this loop

        fig, ax = plt.subplots(figsize=(10, 6))
        sns.boxplot(x=df[num_feature], ax=ax)
        ax.set_title(f'Box plot of {num_feature}')
        ax.set_xlabel(num_feature)
        fig.savefig(out_path(f'{safe_name}_boxplot.png'))
        plt.close(fig)

    # Correlation analysis: skipped when there is nothing to correlate,
    # because an empty correlation matrix cannot be drawn as a heatmap.
    if len(numerical_features) == 0:
        return

    correlation_matrix = df[numerical_features].corr()
    fig, ax = plt.subplots(figsize=(12, 10))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
    ax.set_title('Correlation Matrix')
    fig.savefig(out_path('correlation_matrix.png'))
    plt.close(fig)

# Load datasets and perform EDA.
# os.listdir returns entries in arbitrary order; sorting keeps the processing
# log reproducible between runs.
for file_name in sorted(os.listdir(data_dir)):
    # Case-insensitive match so files such as 'DATA.CSV' are not skipped.
    if not file_name.lower().endswith('.csv'):
        continue
    dataset_name = os.path.splitext(file_name)[0]
    print(f"Processing dataset: {dataset_name}")
    file_path = os.path.join(data_dir, file_name)
    df = pd.read_csv(file_path)
    perform_eda(df, dataset_name)
