In [3]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Load your dataset
# Replace 'Titanic_train.csv' with your actual dataset
data = pd.read_csv('Titanic_train.csv')
data = pd.read_csv('Titanic_test.csv')# Check if 'target' column exists
if 'target' not in data.columns:
    print("Error: 'target' column not found in the dataset.")
    print("Available columns:", data.columns)
else:
    # Check for multicollinearity using Variance Inflation Factor (VIF)
    X = data.drop('target', axis=1)  # Drop the target column for input features
    y = data['target']  # Target variable

    vif_data = pd.DataFrame()
    vif_data["Feature"] = X.columns
    vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
    print("\nVariance Inflation Factor (VIF):\n", vif_data)

    # Drop highly correlated features (if VIF > 5)
    X = X.drop(columns=vif_data[vif_data["VIF"] > 5]["Feature"].tolist())
# Exploratory Data Analysis (EDA)
print("Dataset head:\n", data.head())
print("\nDataset info:\n")
data.info()
print("\nSummary statistics:\n", data.describe())

# Check for missing values
print("\nMissing values:\n", data.isnull().sum())

# Data Preprocessing
# Handle missing values (if any)
# Separate numeric and non-numeric columns
numeric_columns = data.select_dtypes(include=['number']).columns  # Select numeric columns
non_numeric_columns = data.select_dtypes(exclude=['number']).columns  # Select non-numeric columns

# Fill missing values for numeric columns with mean
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].mean())

# Fill missing values for non-numeric columns with the most frequent value (mode)
for column in non_numeric_columns:
    data[column].fillna(data[column].mode()[0], inplace=True)

# Outlier treatment using Interquartile Range (IQR)
Q1 = data[numeric_columns].quantile(0.25)
Q3 = data[numeric_columns].quantile(0.75)
IQR = Q3 - Q1
# Only apply outlier treatment to numeric columns
data = data[~((data[numeric_columns] < (Q1 - 1.5 * IQR)) | (data[numeric_columns] > (Q3 + 1.5 * IQR))).any(axis=1)]

# Check for multicollinearity using Variance Inflation Factor (VIF)
# Replace 'target' with your actual dependent variable name
X = data.drop('target', axis=1)  # Drop the target column for input features
y = data['target']  # Target variable
vif_data = pd.DataFrame()
vif_data["Feature"] = X.columns
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print("\nVariance Inflation Factor (VIF):\n", vif_data)

# Drop highly correlated features (if VIF > 5)
X = X.drop(columns=vif_data[vif_data["VIF"] > 5]["Feature"].tolist())

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# ROC-AUC Curve
y_pred_proba = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="ROC Curve")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.show()
print("\nROC-AUC Score:", roc_auc_score(y_test, y_pred_proba))

# Interpretation of Coefficients
coefficients = pd.DataFrame({"Feature": X.columns, "Coefficient": model.coef_[0]})
print("\nLogistic Regression Coefficients:\n", coefficients)

# Save the model using pickle for deployment
import pickle
with open('logistic_model.pkl', 'wb') as file:
    pickle.dump(model, file)

# Streamlit App for Deployment
# Save this code as `app.py`
streamlit_code = """
import streamlit as st
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the model
model = pickle.load(open('logistic_model.pkl', 'rb'))

# Streamlit App
st.title('Logistic Regression Prediction App')

# User input
def user_input_features():
    feature1 = st.number_input('Feature 1')
    feature2 = st.number_input('Feature 2')
    # Add more input fields as per your dataset
    data = {'Feature1': feature1, 'Feature2': feature2}
    features = pd.DataFrame(data, index=[0])
    return features

input_df = user_input_features()

# Prediction
if st.button('Predict'):
    scaler = StandardScaler()
    input_scaled = scaler.fit_transform(input_df)
    prediction = model.predict(input_scaled)
    st.write('Prediction:', 'Positive' if prediction[0] == 1 else 'Negative')
"""

with open('app.py', 'w') as file:
    file.write(streamlit_code)

print("\nStreamlit app saved as 'app.py'. Run it using `streamlit run app.py`.")

Error: 'target' column not found in the dataset.
Available columns: Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')
Dataset head:
    PassengerId  Pclass                                          Name     Sex  \
0          892       3                              Kelly, Mr. James    male   
1          893       3              Wilkes, Mrs. James (Ellen Needs)  female   
2          894       2                     Myles, Mr. Thomas Francis    male   
3          895       3                              Wirz, Mr. Albert    male   
4          896       3  Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female   

    Age  SibSp  Parch   Ticket     Fare Cabin Embarked  
0  34.5      0      0   330911   7.8292   NaN        Q  
1  47.0      1      0   363272   7.0000   NaN        S  
2  62.0      0      0   240276   9.6875   NaN        Q  
3  27.0      0      0   315154   8.6625   NaN        S  
4  22.0   

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[column].fillna(data[column].mode()[0], inplace=True)


KeyError: "['target'] not found in axis"