In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Load the training data
df = pd.read_csv('train.csv')

# Separate features (X) and target (y)
X = df.drop(columns=['Churn'])
y = df['Churn']

# Use sparse=True to create a sparse matrix and save memory
dummy_clf = pd.get_dummies(X, drop_first=True, sparse=True)

# Initialize the Logistic Regression model
clf = LogisticRegression(random_state=42, max_iter=1000)

# Train the model using the training data
clf.fit(dummy_clf, y)

# Load the test data
test_df = pd.read_csv('test.csv')

# Preprocess the test data by dropping 'CustomerID'
X_test = test_df.drop(columns=['CustomerID'])

# One-hot encode the test data
test_df_dummies = pd.get_dummies(X_test, drop_first=True, sparse=True)

# Align the test data columns with the training data columns
test_df_processed = test_df_dummies.reindex(columns=dummy_clf.columns, fill_value=0)

# Predict probabilities for the test data
test_predictions_proba = clf.predict_proba(test_df_processed)[:, 1]

# Print predicted probabilities
print(test_predictions_proba)




In [3]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

# Load the training data
train_df = pd.read_csv('train.csv')

# Drop 'CustomerID' and separate features (X) and target (y)
X_train = train_df.drop(columns=['Churn', 'CustomerID'])
y_train = train_df['Churn']

# Label encode the categorical variables to save memory
label_encoders = {}
for col in X_train.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    label_encoders[col] = le

# Initialize and fit the Logistic Regression model
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X_train, y_train)

# Load the test data and preprocess it
test_df = pd.read_csv('test.csv')

# Drop 'CustomerID' and encode test data using the same label encoders
X_test = test_df.drop(columns=['CustomerID'])
for col in X_test.select_dtypes(include=['object']).columns:
    X_test[col] = label_encoders[col].transform(X_test[col].astype(str))

# Predict the probability for the test data
test_predictions_proba = clf.predict_proba(X_test)[:, 1]

# Print the predicted probabilities for class 1 (Churn)
print(test_predictions_proba)

# Ensure the output is 104,480 entries
print(f"Number of predictions: {len(test_predictions_proba)}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[0.10957452 0.03843084 0.36481048 ... 0.09535441 0.23935112 0.01586952]
Number of predictions: 104480


In [4]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

# Load the training data
train_df = pd.read_csv('train.csv')

# Drop 'CustomerID' and separate features (X) and target (y)
X_train = train_df.drop(columns=['Churn', 'CustomerID'])
y_train = train_df['Churn']

# Label encode the categorical variables in training data
label_encoders = {}
for col in X_train.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    label_encoders[col] = le

# Initialize and fit the Logistic Regression model
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X_train, y_train)

# Load the test data
test_df = pd.read_csv('test.csv')

# Drop 'CustomerID' and encode test data using the same label encoders
X_test = test_df.drop(columns=['CustomerID'])
for col in X_test.select_dtypes(include=['object']).columns:
    X_test[col] = label_encoders[col].transform(X_test[col].astype(str))

# Predict the probability for the test data
predicted_probability = clf.predict_proba(X_test)[:, 1]

# Combine predictions with the 'CustomerID' into a dataframe
prediction_df = pd.DataFrame({
    'CustomerID': test_df['CustomerID'], 
    'predicted_probability': predicted_probability
})

# Print the resulting DataFrame
print(prediction_df)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


        CustomerID  predicted_probability
0       O1W6BHP6RM               0.109575
1       LFR4X92X8H               0.038431
2       QM5GBIYODA               0.364810
3       D9RXTK2K9F               0.034647
4       ENTCCHR1LR               0.138235
...            ...                    ...
104475  UTKREC613O               0.080177
104476  MDB4E477PS               0.115909
104477  IPDIA02ZE1               0.095354
104478  ITLFTPRJGV               0.239351
104479  Y204GZY6NE               0.015870

[104480 rows x 2 columns]
