In [None]:
import kagglehub

# Download the dataset through kagglehub; `path` holds whatever
# dataset_download() returns for this dataset handle.
path = kagglehub.dataset_download(
    "ak0212/marriage-trends-in-india-love-vs-arranged"
)

print(f"Path to dataset files: {path}")

In [None]:
# Shell command (IPython `!` magic): runs nbconvert with the ClearOutputPreprocessor
# enabled and --inplace on the file marrage.ipynb.
# NOTE(review): presumably marrage.ipynb is this notebook itself — confirm the file
# name; this is a maintenance step rather than part of the analysis.
!jupyter nbconvert --ClearOutputPreprocessor.enabled=True --inplace marrage.ipynb


In [None]:
import pandas as pd

# Load the dataset
file_path = "marriage_data_india.csv"  # Update with the correct path if needed
df = pd.read_csv(file_path)

# Display basic information (column names, dtypes, non-null counts).
# DataFrame.info() writes the summary itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()



In [None]:
# Display the first few rows (head() defaults to 5) to check that the CSV parsed
# into the expected columns
df.head()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# Summary for the object-dtype (text) columns: count, number of unique values,
# most frequent value and its frequency
df.describe(include='object')

In [None]:
# Number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

In [None]:
# Normalise every column label to lower case; the later cells refer to the
# columns by their lower-case names.
df.columns = df.columns.map(str.lower)
df.columns

In [None]:
from datetime import datetime

# Calendar year in which the notebook is being executed.
# NOTE(review): year_of_marriage therefore shifts with the run date — confirm
# whether a fixed reference year for the dataset should be used instead.
current_year = datetime.today().year

# year_of_marriage = current_year - years_since_marriage (rsub is the
# reflected subtraction: other - series)
df['year_of_marriage'] = df['years_since_marriage'].rsub(current_year)

# Preview the frame with the derived column appended
df.head()

In [None]:
# Separate views of the frame by dtype:
#   num -> numeric columns, cat -> object (string) columns
num = df.select_dtypes(include='number')
cat = df.select_dtypes(include='object')

In [None]:
# Preview the numeric subset selected above
num.head()

In [None]:
# Preview the object-dtype (categorical) subset selected above
cat.head()

In [None]:
# plotly.express is imported here because this cell runs before the notebook's
# later `import plotly.express as px`; without it a fresh Run All raises NameError.
import plotly.express as px

# Loop through each categorical column: print its value counts and plot them
for col in cat.columns:
    # Compute the counts once and reuse them for both the printout and the chart
    counts = df[col].value_counts()

    print(f" Value Counts for: {col}\n")
    print(counts, "\n")

    # Tidy two-column frame (category, Count) so the axes are labelled by name
    counts_frame = counts.rename_axis(col).reset_index(name="Count")

    # Plot bar chart using Plotly
    fig = px.bar(counts_frame,
                 x=col,
                 y="Count",
                 title=f"Distribution of {col}")
    fig.show()

In [None]:
# Marriage-type mix per year of marriage.
# Step 1: count rows per (year_of_marriage, marriage_type); years become rows,
# marriage types become columns, absent combinations are filled with 0.
yearly_counts = (
    df.groupby(['year_of_marriage', 'marriage_type'])
      .size()
      .unstack(fill_value=0)
)

# Step 2: divide each row by its own total and scale to percent.
yearly_percent = yearly_counts.div(yearly_counts.sum(axis=1), axis='index').mul(100)
yearly_percent

In [None]:
# Rows = gender, columns = marriage type, cells = number of records
gender_count = (
    df.groupby(['gender', 'marriage_type'])
      .size()
      .unstack(fill_value=0)
)

# Share of each marriage type within a gender, in percent (each row sums to 100)
gender_percent = gender_count.div(gender_count.sum(axis=1), axis='index') * 100

# Show the percentage table
print(gender_percent)


In [None]:
# Count occurrences of each religion per gender (rows = gender, columns = religion)
religion_count = df.groupby(['gender', 'religion']).size().unstack(fill_value=0)

# Convert to percentage within each gender. The denominator is this table's own
# row totals rather than `gender_count` from another cell, so the cell runs on
# its own and stays correct if the two tables drop different missing rows.
religion_percent = religion_count.div(religion_count.sum(axis=1), axis=0) * 100

# Display percentage distribution
print(religion_percent)


In [None]:
# Count occurrences of each divorce_status per gender
# (rows = gender, columns = divorce_status)
divorce_status_count = df.groupby(['gender', 'divorce_status']).size().unstack(fill_value=0)

# Convert to percentage within each gender. The denominator is this table's own
# row totals rather than `gender_count` from another cell, so the cell runs on
# its own and stays correct if the two tables drop different missing rows.
divorce_status_percent = divorce_status_count.div(divorce_status_count.sum(axis=1), axis=0) * 100

# Display percentage distribution
print(divorce_status_percent)


In [None]:
# Count occurrences of each income_level per gender
# (rows = gender, columns = income_level)
income_level_count = df.groupby(['gender', 'income_level']).size().unstack(fill_value=0)

# Convert to percentage within each gender. The denominator is this table's own
# row totals rather than `gender_count` from another cell, so the cell runs on
# its own and stays correct if the two tables drop different missing rows.
income_level_percent = income_level_count.div(income_level_count.sum(axis=1), axis=0) * 100

# Display percentage distribution
print(income_level_percent)


In [None]:
# Count occurrences of each marital_satisfaction level per gender
# (rows = gender, columns = marital_satisfaction).
# Stored under its own names so the income-level tables are not overwritten.
marital_satisfaction_count = (
    df.groupby(['gender', 'marital_satisfaction']).size().unstack(fill_value=0)
)

# Convert to percentage within each gender, using this table's own row totals
# as the denominator so the cell does not depend on `gender_count`.
marital_satisfaction_percent = (
    marital_satisfaction_count.div(marital_satisfaction_count.sum(axis=1), axis=0) * 100
)

# Display percentage distribution
print(marital_satisfaction_percent)


# Visualise 

In [None]:
import plotly.express as px
# List the available column names for reference while building the plots below
df.columns

In [None]:
# Count of records for each marriage type
fig = px.histogram(
    data_frame=df,
    x="marriage_type",
    nbins=30,
    title="Distribution of marriage_type",
)
fig.show()


In [None]:
# Age at marriage against income level, one colour per marriage type
fig = px.scatter(
    data_frame=df,
    x="age_at_marriage",
    y="income_level",
    color="marriage_type",
    title="Marriage Age vs. Income by Marriage Type",
)
fig.show()


In [None]:
# Spread of age at marriage within each religion.
# The x axis is the `religion` column, so the title says "Religion".
fig = px.box(df, x="religion", y="age_at_marriage", title="Marriage Age Distribution by Religion")
fig.show()


In [None]:
import plotly.figure_factory as ff
import numpy as np

# Pairwise correlation matrix of the numeric columns
correlation = num.corr()

# Annotated heatmap: every cell shows its correlation value
fig = ff.create_annotated_heatmap(
    z=correlation.values,
    x=correlation.columns.tolist(),
    y=correlation.index.tolist(),
    colorscale="Viridis",
)

# Single layout update: small font so the annotations fit, plus a fixed canvas size
fig.update_layout(
    font=dict(family="Arial", size=7, color="black"),
    width=700,
    height=500,
)

fig.show()


In [None]:
# Donut chart (hole=0.4) of the share of each marriage type
fig = px.pie(
    data_frame=df,
    names="marriage_type",
    title="Proportion of Marriage Types",
    hole=0.4,
)
fig.show()


In [None]:
# Spread of age at marriage for each gender, coloured by gender
fig = px.box(
    data_frame=df,
    x="gender",
    y="age_at_marriage",
    color="gender",
    title="Marriage Age Distribution by Gender",
)
fig.show()


In [None]:
# Violin plot of age at marriage per education level, with an inner box plot
# and every individual observation drawn as a point
fig = px.violin(
    data_frame=df,
    x="education_level",
    y="age_at_marriage",
    color="education_level",
    title="Marriage Age Distribution by Education Level",
    box=True,
    points="all",
)
fig.show()


In [None]:
# Record counts per income level, split by marriage type
fig = px.histogram(
    data_frame=df,
    x="income_level",
    nbins=40,
    color="marriage_type",
    title="Income Distribution by Marriage Type",
)
fig.show()


In [None]:
# Record counts per divorce status, split by marriage type.
# The title names the plotted column (divorce_status), not income.
fig = px.histogram(df, x="divorce_status", nbins=40, color="marriage_type",
                   title="Divorce Status Distribution by Marriage Type")
fig.show()


In [None]:
# Record counts per inter-religion value, split by marriage type.
# The title names the plotted column (inter-religion), not income.
fig = px.histogram(df, x="inter-religion", nbins=40, color="marriage_type",
                   title="Inter-Religion Marriages by Marriage Type")
fig.show()


In [None]:
# Record counts per inter-caste value, split by marriage type.
# The title names the plotted column (inter-caste), not income.
fig = px.histogram(df, x="inter-caste", nbins=40, color="marriage_type",
                   title="Inter-Caste Marriages by Marriage Type")
fig.show()


In [None]:
# Two-level sunburst: inner ring = education level, outer ring = marriage type.
# The title matches the hierarchy actually plotted (education level, not region).
fig = px.sunburst(df, path=["education_level", "marriage_type"],
                  title="Marriage Type by Education Level")
fig.show()


# ML 

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Encode every categorical (object-dtype) column as integer codes, in place.
# Each fitted encoder is kept in `label_encoders` so the codes can be mapped back
# to the original labels later, e.g.
#   label_encoders["divorce_status"].classes_
#   label_encoders[col].inverse_transform(codes)
label_encoders = {}
for col in cat.columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
df.head()

In [None]:
# Separate the prediction target from the feature matrix.
# NOTE(review): X still contains both years_since_marriage and year_of_marriage,
# and the latter is derived from the former (current_year - years_since_marriage)
# — confirm that both are wanted as features.
target_col = "divorce_status"
y = df[target_col]                        # Target
X = df.drop(target_col, axis="columns")   # Features: every remaining column


In [None]:
 #Handle class imbalance using SMOTE
 # Oversamples the complete (X, y) so the classes in y_resampled are balanced.
 # NOTE(review): X_resampled / y_resampled are not referenced by the later cells
 # visible in this notebook — the train/test split below is built from X and y.
 # Resampling before the split would also let synthetic points derived from test
 # rows reach the training data; oversampling normally belongs on the training
 # portion only.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

In [None]:
# Train-test split (80 % train / 20 % test, fixed seed for reproducibility).
# stratify=y keeps the divorce_status class proportions identical in both parts,
# so the minority class is represented in the test set when the target is imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Define models: display name -> unfitted estimator.
# Every estimator that accepts a seed gets random_state=42 so repeated runs of
# the notebook produce the same scores.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "SVM": SVC(kernel="linear"),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

In [None]:
# Train and evaluate each model using pipelines
for name, model in models.items():
    # Steps: scale features -> oversample the minority class -> classify.
    # SMOTE is a pipeline step so the imbalanced-learn pipeline applies it only
    # during fit (to the training data); predict() skips the sampler, so the
    # test set is evaluated on real, un-resampled rows.
    pipeline = ImbPipeline(steps=[
        ("scaler", StandardScaler()),
        ("smote", SMOTE(random_state=42)),
        ("classifier", model)
    ])

    # Train model
    pipeline.fit(X_train, y_train)

    # Predict on test set
    y_pred = pipeline.predict(X_test)

    # Print performance metrics.
    # zero_division=0: a class that is never predicted is reported with a score
    # of 0 instead of a misleading perfect 1.
    print(f"\n🔹 {name} Model Performance:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(classification_report(y_test, y_pred, zero_division=0))


In [None]:
# Get feature importance
# Take the Random Forest from `models`: the evaluation loop fits each pipeline's
# classifier object in place, so this instance is already trained once that
# loop has run.
rf_model = models["Random Forest"]

importance_df = pd.DataFrame({
    "Feature": X.columns,
    "Importance": rf_model.feature_importances_
}).sort_values(by="Importance", ascending=False)

# Plot feature importance as a horizontal bar chart with plotly, the plotting
# library this notebook already imports.
fig = px.bar(
    importance_df,
    x="Importance",
    y="Feature",
    orientation="h",
    title="Feature Importance - Divorce Prediction",
    labels={"Importance": "Feature Importance Score", "Feature": "Features"},
)
# Reverse the category axis so the most important feature is drawn at the top
fig.update_layout(yaxis=dict(autorange="reversed"), width=1000, height=600)
fig.show()
