# Data Leakage - # Splitting Dataset

In [143]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder


df = pd.read_csv('/Users/pedrojosetrujillomejia/Desktop/test.csv')


df['satisfaction'] = pd.Categorical(df['satisfaction'], categories=['dissatisfied', 'neutral or dissatisfied', 'satisfied'])
df['satisfaction'] = df['satisfaction'].cat.codes


target = 'satisfaction'


correlated_features = ['Age', 'Flight Distance']


categorical_cols = ['Gender', 'Class', 'Type of Travel', 'Customer Type']


encoder = OneHotEncoder(handle_unknown='ignore')
encoded_cols = encoder.fit_transform(df[categorical_cols])
df_encoded = pd.concat([df.drop(categorical_cols, axis=1), pd.DataFrame(encoded_cols.toarray())], axis=1)


train_df, test_df = train_test_split(df_encoded, test_size=0.2, random_state=42)


imputer = SimpleImputer(strategy='most_frequent', add_indicator=True)
X_train = imputer.fit_transform(train_df.drop(target, axis=1))
y_train = train_df[target]


model = LogisticRegression()
model.fit(X_train, y_train)

X_test = test_df.drop(target, axis=1)
y_test = test_df[target]


for feature in correlated_features:
    if X_test[feature].dtype == 'object':
        X_test[feature] = pd.to_numeric(X_test[feature], errors='coerce')
    corr = X_test[feature].astype(float).corr(y_test.astype(float))
    if corr > 0.9:
        print(f"WARNING: Potential data leakage from {feature} into target variable")
    else:
        print(f"No potential data leakage from {feature} into target variable")


X_test_imputed = imputer.transform(X_test)
y_pred = model.predict(X_test_imputed)


y_pred_categorical = pd.Categorical.from_codes(y_pred, categories=['dissatisfied', 'neutral or dissatisfied', 'satisfied'])


score = model.score(X_test_imputed, y_test)
print(f"Model score: {score}")
print("Predicted target values:")
print(y_pred_categorical)


  mode = stats.mode(array)


No potential data leakage from Age into target variable
No potential data leakage from Flight Distance into target variable
Model score: 0.6505003849114703
Predicted target values:
['neutral or dissatisfied', 'satisfied', 'neutral or dissatisfied', 'satisfied', 'satisfied', ..., 'satisfied', 'neutral or dissatisfied', 'satisfied', 'satisfied', 'neutral or dissatisfied']
Length: 5196
Categories (3, object): ['dissatisfied', 'neutral or dissatisfied', 'satisfied']


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Data Contimination

In [144]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline


file_path = '/Users/pedrojosetrujillomejia/Desktop/test.csv'
df = pd.read_csv(file_path)


target = 'satisfaction'



df['satisfaction'] = pd.Categorical(df['satisfaction'], categories=['dissatisfied', 'neutral or dissatisfied', 'satisfied'])
df['satisfaction'] = df['satisfaction'].cat.codes



categorical_cols = ['Gender', 'Class', 'Type of Travel', 'Customer Type']



encoder = OneHotEncoder(handle_unknown='ignore')
encoded_cols = encoder.fit_transform(df[categorical_cols])
df_encoded = pd.concat([df.drop(categorical_cols, axis=1), pd.DataFrame(encoded_cols.toarray())], axis=1)


Q1 = df_encoded.quantile(0.25)
Q3 = df_encoded.quantile(0.75)
IQR = Q3 - Q1
df_filtered = df_encoded[~((df_encoded < (Q1 - 1.5 * IQR)) | (df_encoded > (Q3 + 1.5 * IQR))).any(axis=1)]



train_df, test_df = train_test_split(df_filtered, test_size=0.2, random_state=42)



imputer = SimpleImputer(strategy='most_frequent', add_indicator=True)
X_train = imputer.fit_transform(train_df.drop(target, axis=1))
y_train = train_df[target]


drop_column = set(test_df.columns) - set(train_df.columns)
X_test = test_df.drop(target, axis=1).drop(drop_column, axis=1)
y_test = test_df[target]



scaler = StandardScaler()

# Check number of features in X_train and X_test before scaling
if X_train.shape[1] != X_test.shape[1]:
    print("Warning: Number of features in X_train and X_test are different, scaling may not work properly.")

X_train = scaler.fit_transform(X_train)

# Scale X_test only if the number of features is the same as X_train
if X_train.shape[1] == X_test.shape[1]:
    X_test = scaler.transform(X_test)



correlated_features = ['Inflight wifi service', 'Ease of Online booking', 'On-board service', 'Cleanliness']

for feature in correlated_features:
    if X_test[feature].dtype == 'object':
        X_test[feature] = pd.to_numeric(X_test[feature], errors='coerce')
    corr = X_test[feature].astype(float).corr(y_test.astype(float), method='pearson', min_periods=1)
    if pd.isna(corr):
        print(f"WARNING: Could not calculate correlation for feature {feature}")
    elif corr > 0.9:
        print(f"WARNING: Potential data leakage from {feature} into target variable")
    else:
        print(f"No potential data leakage from {feature} into target variable")


model = LogisticRegression()
model.fit(X_train, y_train)





No potential data leakage from Inflight wifi service into target variable
No potential data leakage from Ease of Online booking into target variable
No potential data leakage from On-board service into target variable
No potential data leakage from Cleanliness into target variable


  mode = stats.mode(array)


LogisticRegression()