In [28]:
from scipy.io import arff
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Preprocess the credit-g ARFF dataset into numeric features and a numeric
# target, then split it into train and test sets.
#
# Results left in scope for later cells:
#   df                - fully numeric DataFrame
#   label_encoder     - LabelEncoder fitted on the target column 'class'
#   feature_encoders  - dict: column name -> LabelEncoder fitted on that column
#   X, y              - features / target
#   X_train, X_test, y_train, y_test

# Load the ARFF file
data, meta = arff.loadarff('C:/Users/ADMIN/Downloads/dataset_31_credit-g.arff')

# Convert to pandas DataFrame
df = pd.DataFrame(data)

# Step 1: Decode byte-valued columns to text.
# scipy's ARFF reader hands nominal attributes back as bytes (b'<0', b'good',
# ...). Calling astype(str) on those yields the literal text "b'<0'", and
# passing a bytes pattern to Series.str.replace raises TypeError, so decode
# the values instead. Values that are not bytes are left untouched.
nominal_columns = df.select_dtypes(include=['object']).columns
for column in nominal_columns:
    df[column] = df[column].map(
        lambda value: value.decode('utf-8') if isinstance(value, bytes) else value
    )

# Display the first few rows of the data (a bare `df.head()` in the middle of
# a cell shows nothing, so print it explicitly).
print(df.head())

# Step 2: Check the data types of the columns
print(df.dtypes)

# Step 3: Check for any missing data and handle it
print(df.isnull().sum())  # Shows how many missing values in each column

# Fill missing values with the median, for numeric columns only. This runs
# before encoding so that category codes and the target are never replaced
# by a median.
numeric_columns = df.select_dtypes(include=['number']).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())

# Step 4: Encode the target column 'class'.
# This encoder is reserved for the target so that
# label_encoder.inverse_transform(...) still maps predictions back to the
# original class labels later on.
label_encoder = LabelEncoder()
df['class'] = label_encoder.fit_transform(df['class'])

# Step 5: Encode the remaining non-numeric (nominal) feature columns.
# Entries such as '<0' in checking_status are ordinary categories, not invalid
# values, so they are encoded as-is. Each column gets its own encoder; re-fitting
# a single shared encoder would discard every mapping except the last one.
non_numeric_columns = nominal_columns.difference(['class'], sort=False)
print("Non-numeric columns:", non_numeric_columns)

feature_encoders = {}
for column in non_numeric_columns:
    column_encoder = LabelEncoder()
    df[column] = column_encoder.fit_transform(df[column])
    feature_encoders[column] = column_encoder

# Every column is numeric at this point, so no pd.to_numeric(errors='coerce')
# pass is needed (it would only hide problems by turning them into NaN).

# Step 6: Split the dataset into features and target
X = df.drop('class', axis=1)  # Features
y = df['class']               # Target

# Step 7: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Now, you can proceed with training a model (RandomForest, etc.)


checking_status            object
duration                  float64
credit_history             object
purpose                    object
credit_amount             float64
savings_status             object
employment                 object
installment_commitment    float64
personal_status            object
other_parties              object
residence_since           float64
property_magnitude         object
age                       float64
other_payment_plans        object
housing                    object
existing_credits          float64
job                        object
num_dependents            float64
own_telephone              object
foreign_worker             object
class                      object
dtype: object
checking_status           0
duration                  0
credit_history            0
purpose                   0
credit_amount             0
savings_status            0
employment                0
installment_commitment    0
personal_status           0
other_parties       