In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import statsmodels.api as sm


In [9]:
pd.set_option('display.max_columns', None)

In [3]:
numerical = pd.read_csv('numerical.csv')
numerical.shape

(95412, 315)

In [5]:
dtypes_dict = numerical.dtypes.to_dict()
for column, dtype in dtypes_dict.items():
    print(f'{column}: {dtype}')
#I don't think we have to change any data type

TCODE: int64
AGE: float64
INCOME: int64
WEALTH1: int64
HIT: int64
MALEMILI: int64
MALEVET: int64
VIETVETS: int64
WWIIVETS: int64
LOCALGOV: int64
STATEGOV: int64
FEDGOV: int64
WEALTH2: int64
POP901: int64
POP902: int64
POP903: int64
POP90C1: int64
POP90C2: int64
POP90C3: int64
POP90C4: int64
POP90C5: int64
ETH1: int64
ETH2: int64
ETH3: int64
ETH4: int64
ETH5: int64
ETH6: int64
ETH7: int64
ETH8: int64
ETH9: int64
ETH10: int64
ETH11: int64
ETH12: int64
ETH13: int64
ETH14: int64
ETH15: int64
ETH16: int64
AGE901: int64
AGE902: int64
AGE903: int64
AGE904: int64
AGE905: int64
AGE906: int64
AGE907: int64
CHIL1: int64
CHIL2: int64
CHIL3: int64
AGEC1: int64
AGEC2: int64
AGEC3: int64
AGEC4: int64
AGEC5: int64
AGEC6: int64
AGEC7: int64
CHILC1: int64
CHILC2: int64
CHILC3: int64
CHILC4: int64
CHILC5: int64
HHAGE1: int64
HHAGE2: int64
HHAGE3: int64
HHN1: int64
HHN2: int64
HHN3: int64
HHN4: int64
HHN5: int64
HHN6: int64
MARR1: int64
MARR2: int64
MARR3: int64
MARR4: int64
HHP1: int64
HHP2: int64
DW1: i

In [6]:
categorical = pd.read_csv('categorical.csv')
categorical.shape

(95412, 22)

In [7]:
categorical.dtypes
#also here nothing to be changed

STATE           object
CLUSTER          int64
HOMEOWNR        object
GENDER          object
DATASRCE         int64
RFA_2R          object
RFA_2A          object
GEOCODE2        object
DOMAIN_A        object
DOMAIN_B         int64
ODATEW_YR        int64
ODATEW_MM        int64
DOB_YR           int64
DOB_MM           int64
MINRDATE_YR      int64
MINRDATE_MM      int64
MAXRDATE_YR      int64
MAXRDATE_MM      int64
LASTDATE_YR      int64
LASTDATE_MM      int64
FIRSTDATE_YR     int64
FIRSTDATE_MM     int64
dtype: object

In [15]:
targets = pd.read_csv('target.csv')
targets.head()

Unnamed: 0,TARGET_B,TARGET_D
0,0,0.0
1,0,0.0
2,0,0.0
3,0,0.0
4,0,0.0


In [16]:
all_data = pd.concat((numerical,categorical, targets),axis=1)
all_data.shape

(95412, 339)

In [18]:
X = all_data.drop(columns=['TARGET_B','TARGET_D'])
y = all_data['TARGET_B']

In [19]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

train_num = X_train[numerical_cols]
train_cat = X_train[categorical_cols]
test_num = X_test[numerical_cols]
test_cat = X_test[categorical_cols]

In [23]:
# Choose scaler
scaler = StandardScaler()  # Or use MinMaxScaler()

# Choose encoder
encoder = OneHotEncoder(drop='first', sparse=False)  # Or use OrdinalEncoder()

# Scale and encode
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numerical_cols),
        ('cat', encoder, categorical_cols)
    ]
)

# Fit on training data and transform both training and test data
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Convert back to DataFrame to combine numerical and categorical parts
X_train = pd.DataFrame(X_train_processed, columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(X_test_processed, columns=preprocessor.get_feature_names_out())




In [24]:
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.95


In [27]:
from collections import Counter
class_counts = Counter(y_train)
print(f'Class distribution in the training set: {class_counts}')

Class distribution in the training set: Counter({0: 72464, 1: 3865})


In [29]:
from sklearn.utils import resample
train_data = pd.concat([X_train, y_train.reset_index(drop=True)], axis=1)
majority_class = train_data[train_data['TARGET_B'] == 0]
minority_class = train_data[train_data['TARGET_B'] == 1]

In [31]:
#Upsample
minority_upsampled = resample(minority_class,
                              replace=True,  # Sample with replacement
                              n_samples=len(majority_class),  # Match number of majority class
                              random_state=42)

upsampled_data = pd.concat([majority_class, minority_upsampled])

In [33]:
# Separate features and target
X_train_upsampled = upsampled_data.drop('TARGET_B', axis=1)
y_train_upsampled = upsampled_data['TARGET_B']

In [34]:
#Downsample
majority_downsampled = resample(majority_class,
                                replace=False,  # Sample without replacement
                                n_samples=len(minority_class),  # Match number of minority class
                                random_state=42)
downsampled_data = pd.concat([minority_class, majority_downsampled])

In [35]:
X_train_downsampled = downsampled_data.drop('TARGET_B', axis=1)
y_train_downsampled = downsampled_data['TARGET_B']

In [36]:
def fit_and_evaluate(X_train, y_train, X_test, y_test):
    model = LogisticRegression()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy: {accuracy:.2f}')
    return accuracy

In [37]:
# Original data
print("Original data:")
fit_and_evaluate(X_train, y_train, X_test, y_test)

# Upsampled data
print("Upsampled data:")
fit_and_evaluate(X_train_upsampled, y_train_upsampled, X_test, y_test)

# Downsampled data
print("Downsampled data:")
fit_and_evaluate(X_train_downsampled, y_train_downsampled, X_test, y_test)

Original data:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.95
Upsampled data:
Accuracy: 0.61
Downsampled data:
Accuracy: 0.58


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.5771629198763297