In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# Default figure size applied to every plot drawn in this notebook.
plt.rcParams.update({"figure.figsize": (8, 5)})

In [None]:
# Read the applicant records from disk, decoding the file as Latin-1.
df = pd.read_csv(
    filepath_or_buffer="../data/visa_data.csv",
    encoding="latin1",
)
# Preview the first five rows.
df.head(5)

In [None]:
# Quick structural overview of the loaded frame: sample rows, dtypes and
# per-column missing-value counts.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.isnull().sum())


In [None]:
def duration_to_months(value):
    """Convert a free-text duration such as "2 years" into a number of months.

    The first whitespace-separated token is read as the amount, and the unit
    is detected by substring search (case-insensitive), checked in the order
    "year", "month", "day". Only that leading amount is used, so a compound
    value like "1 year 6 months" is converted as 1 year.

    Parameters
    ----------
    value : object
        Raw cell value; it is converted with ``str()`` before parsing.

    Returns
    -------
    float
        ``amount * 12`` for years, ``amount`` for months, ``amount / 30`` for
        days. ``np.nan`` when the value is missing, contains no recognised
        unit, or does not start with a number.
    """
    if pd.isna(value):
        return np.nan

    text = str(value).lower()
    tokens = text.split()
    if not tokens:
        # Empty or whitespace-only text carries no amount.
        return np.nan

    try:
        amount = float(tokens[0])
    except ValueError:
        # The leading token is not a number (e.g. "about 6 months", "months",
        # "6months"). Treat it like any other unparseable value instead of
        # letting one bad cell abort the whole column conversion.
        return np.nan

    if "year" in text:
        return amount * 12
    if "month" in text:
        return amount
    if "day" in text:
        # Days are converted using a fixed 30-day month.
        return amount / 30
    return np.nan

# Derive a numeric month count from each raw "intended_duration" entry.
df["intended_duration_months"] = df["intended_duration"].map(duration_to_months)

In [None]:
# Force "age" to a numeric dtype; entries that cannot be parsed become NaN.
df["age"] = df["age"].pipe(pd.to_numeric, errors="coerce")

In [None]:
# Histogram of applicant ages; missing ages are excluded before binning.
ax = plt.gca()
ax.hist(df["age"].dropna(), bins=20)
ax.set_title("Age Distribution of Applicants")
ax.set_xlabel("Age")
ax.set_ylabel("Count")
plt.show()


In [None]:
# Histogram of work experience; rows without a value are excluded.
ax = plt.gca()
ax.hist(df["work_experience_years"].dropna(), bins=20)
ax.set_title("Work Experience Distribution")
ax.set_xlabel("Years of Experience")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Number of applications per visa category, one bar per category.
category_counts = df["visa_category_(label)"].value_counts()
ax = category_counts.plot.bar()
ax.set_title("Visa Category Distribution")
ax.set_xlabel("Visa Category")
ax.set_ylabel("Count")
plt.show()

In [None]:
# The ten most frequent target countries, most frequent first.
top_countries = df["target_country"].value_counts().head(10)
ax = top_countries.plot.bar()
ax.set_title("Top Target Countries")
ax.set_xlabel("Country")
ax.set_ylabel("Number of Applications")
plt.show()


In [None]:
# Spread of applicant age within each visa category.
df.boxplot(column="age", by="visa_category_(label)")
ax = plt.gca()
ax.set_title("Age vs Visa Category")
ax.set_xlabel("Visa Category")
ax.set_ylabel("Age")
# Blank the figure-level heading so only the axes title above is shown.
plt.gcf().suptitle("")
plt.show()

In [None]:
# Spread of the document completeness score within each visa category.
df.boxplot(column="document_completeness_score", by="visa_category_(label)")
plt.title("Document Completeness vs Visa Category")
# Blank the figure-level heading so only the axes title above is shown.
plt.suptitle("")
# Label the x axis explicitly, as the age boxplot does, so the axis reads
# "Visa Category" instead of a default label.
plt.xlabel("Visa Category")
plt.ylabel("Completeness Score")
plt.show()

In [None]:
# Columns the feature-engineering steps below treat as numeric.
numeric_cols = [
    "age",
    "work_experience_years",
    "previous_visa_rejections",
    "document_completeness_score",
    "intended_duration"
]

# Coerce each column to a numeric dtype; unparseable entries become NaN.
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# "intended_duration" is the column duration_to_months() parses, i.e. it holds
# text such as "2 years", which the coercion above turns into NaN. Recover
# those rows from the already-parsed month values so the column is not wiped
# out. Entries that were plain numbers keep their coerced value.
# NOTE(review): plain-number entries are assumed to be in months as well;
# confirm against the data.
df["intended_duration"] = df["intended_duration"].fillna(
    df["intended_duration_months"]
)

In [None]:
# Mean work experience per target country, returned as a two-column frame:
# "target_country" and "avg_experience_by_country".
country_avg_experience = (
    df.groupby("target_country")["work_experience_years"]
    .mean()
    .rename("avg_experience_by_country")
    .reset_index()
)

In [None]:
# Attach each target country's average experience to every application row.
# Any copy of the column left by an earlier run of this cell is dropped first;
# otherwise a second merge would produce suffixed duplicate columns and the
# plain "avg_experience_by_country" name used later would no longer exist.
df = df.drop(columns=["avg_experience_by_country"], errors="ignore").merge(
    country_avg_experience,
    on="target_country",
    how="left",
    # One summary row per country is expected; fail loudly instead of
    # silently duplicating application rows if that ever stops holding.
    validate="many_to_one",
)

In [None]:
# Risk score = previous rejections + (1 - document completeness).
# Missing values in either input are treated as 0 before combining.
rejection_count = df["previous_visa_rejections"].fillna(0)
completeness_gap = 1 - df["document_completeness_score"].fillna(0)
df["risk_score"] = rejection_count + completeness_gap

In [None]:
# Impute remaining gaps in the numeric features with each column's median.
# "intended_duration_months" is included because it holds the parsed duration
# values and was previously left with its missing entries unfilled.
median_fill_cols = [
    "age",
    "work_experience_years",
    "avg_experience_by_country",
    "intended_duration",
    "intended_duration_months",
]

# Rebind df to the filled result instead of mutating it with inplace=True, so
# the step reads as an explicit transformation of the frame.
df = df.fillna({col: df[col].median() for col in median_fill_cols})

In [None]:
# Show the final column index, including the columns added in the cells above.
print(df.columns)