In [2]:
import numpy as np
import pandas as pd # used to store and manipulate the data in table form
from datetime import datetime # helps to convert the date strings into the real date objects

In [5]:
data = {
    "Application_Date": [
        "2024-01-01", "2024-02-15", "2024-03-10",
        "2024-04-05", "2024-05-12", "2024-06-01",
        "2024-06-18", "2024-07-02", "2024-07-25",
        "2024-08-10"
    ],
    "Decision_Date": [
        "2024-02-01", "2024-03-20", "2024-04-05",
        "2024-05-01", "2024-06-20", "2024-06-25",
        "2024-07-28", "2024-08-05", "2024-09-15",
        "2024-10-01"
    ],
    "Country": [
        "India", "United States", "United Kingdom",
        "Canada", "Australia", "Germany",
        "India", "France", "Japan", "Brazil"
    ],
    "Visa_Type": [
        "Student", "Tourist", "Work",
        "Tourist", "Student", "Work",
        "Work", "Tourist", "Student", "Work"
    ]
}


In [6]:
df = pd.DataFrame(data)
# converts the dict into a Pandas Dataframe, which behaves like a spreadsheet
print(df) # shows a sample datasets as a excel format

  Application_Date Decision_Date         Country Visa_Type
0       2024-01-01    2024-02-01           India   Student
1       2024-02-15    2024-03-20   United States   Tourist
2       2024-03-10    2024-04-05  United Kingdom      Work
3       2024-04-05    2024-05-01          Canada   Tourist
4       2024-05-12    2024-06-20       Australia   Student
5       2024-06-01    2024-06-25         Germany      Work
6       2024-06-18    2024-07-28           India      Work
7       2024-07-02    2024-08-05          France   Tourist
8       2024-07-25    2024-09-15           Japan   Student
9       2024-08-10    2024-10-01          Brazil      Work


In [8]:
df["Application_Date"] = pd.to_datetime(df["Application_Date"])
df["Decision_Date"] = pd.to_datetime(df["Decision_Date"])

print(df)

  Application_Date Decision_Date         Country Visa_Type
0       2024-01-01    2024-02-01           India   Student
1       2024-02-15    2024-03-20   United States   Tourist
2       2024-03-10    2024-04-05  United Kingdom      Work
3       2024-04-05    2024-05-01          Canada   Tourist
4       2024-05-12    2024-06-20       Australia   Student
5       2024-06-01    2024-06-25         Germany      Work
6       2024-06-18    2024-07-28           India      Work
7       2024-07-02    2024-08-05          France   Tourist
8       2024-07-25    2024-09-15           Japan   Student
9       2024-08-10    2024-10-01          Brazil      Work


In [12]:
# calculating the processing time

df["Processing_Days"] = (df["Decision_Date"] - df["Application_Date"]).dt.days
# subtracting the dates gives a time difference
#.dt.days-> extracts the dofferenece in days

print(df)

  Application_Date Decision_Date         Country Visa_Type  Processing_Days
0       2024-01-01    2024-02-01           India   Student               31
1       2024-02-15    2024-03-20   United States   Tourist               34
2       2024-03-10    2024-04-05  United Kingdom      Work               26
3       2024-04-05    2024-05-01          Canada   Tourist               26
4       2024-05-12    2024-06-20       Australia   Student               39
5       2024-06-01    2024-06-25         Germany      Work               24
6       2024-06-18    2024-07-28           India      Work               40
7       2024-07-02    2024-08-05          France   Tourist               34
8       2024-07-25    2024-09-15           Japan   Student               52
9       2024-08-10    2024-10-01          Brazil      Work               52


In [13]:
df_encoded = pd.get_dummies(df, columns=["Country", "Visa_Type"])
#pd.get_dummies -> converts the categorical columns inot 0/1 encoded columns
print(df_encoded)

  Application_Date Decision_Date  Processing_Days  Country_Australia  \
0       2024-01-01    2024-02-01               31              False   
1       2024-02-15    2024-03-20               34              False   
2       2024-03-10    2024-04-05               26              False   
3       2024-04-05    2024-05-01               26              False   
4       2024-05-12    2024-06-20               39               True   
5       2024-06-01    2024-06-25               24              False   
6       2024-06-18    2024-07-28               40              False   
7       2024-07-02    2024-08-05               34              False   
8       2024-07-25    2024-09-15               52              False   
9       2024-08-10    2024-10-01               52              False   

   Country_Brazil  Country_Canada  Country_France  Country_Germany  \
0           False           False           False            False   
1           False           False           False            False 

In [14]:
X = df_encoded.drop(columns=["Processing_Days", "Application_Date", "Decision_Date"])
y = df_encoded["Processing_Days"]

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
model = LinearRegression()
model.fit(X, y)

In [19]:
sample_input = {
    "country_India": 1,
    "country_United_States": 0,
    "country_United_Kingdom": 0,
    "country_Canada": 0,
    "country_Australia": 0,
    "country_Germany": 0,
    "country_France": 0,
    "country_Japan": 0,
    "country_Brazil": 0,

    "visa_type_Student": 1,
    "visa_type_Tourist": 0,
    "visa_type_Work": 0
}


In [20]:
import numpy as np
sample_array = np.array(list(sample_input.values())).reshape(1, -1)

predicted_days = model.predict(sample_array)
print("Predicted Processing Time:", predicted_days[0], "days")


Predicted Processing Time: 38.999999999999986 days




In [22]:
## datas with missing values 
data = {
    "Application_Date": [
        "2024-01-01", "2024-02-15", "2024-03-10",
        "2024-04-05", "2024-05-12", "2024-06-01",
        "2024-06-18", "2024-07-02", "2024-07-25",
        "2024-08-10"
    ],
    "Decision_Date": [
        "2024-02-01", "2024-03-20", "2024-04-05",
        "2024-05-01", "2024-06-20", "2024-06-25",
        "2024-07-28", "2024-08-05", "2024-09-15",
        "2024-10-01"
    ],
    "Country": [
        "India", "United States", "United Kingdom",
        "Canada", "Australia", "Germany",
        "India", "France", "Japan", "Brazil"
    ],
    "Visa_Type": [
        "Student", "Tourist", "Work",
        "Tourist", "Student", "Work",
        "Work", "Tourist", "Student", "Work"
    ]
}

In [23]:
df = pd.DataFrame(data)
print("Original DataFrame with Missing Values:\n", df)

Original DataFrame with Missing Values:
   Application_Date Decision_Date         Country Visa_Type
0       2024-01-01    2024-02-01           India   Student
1       2024-02-15    2024-03-20   United States   Tourist
2       2024-03-10    2024-04-05  United Kingdom      Work
3       2024-04-05    2024-05-01          Canada   Tourist
4       2024-05-12    2024-06-20       Australia   Student
5       2024-06-01    2024-06-25         Germany      Work
6       2024-06-18    2024-07-28           India      Work
7       2024-07-02    2024-08-05          France   Tourist
8       2024-07-25    2024-09-15           Japan   Student
9       2024-08-10    2024-10-01          Brazil      Work


In [24]:
miss= df.isnull().sum()
# isnull-> finds the missing values
# sum() -> counts them in colm-wise
print("\nMissing values count:\n", miss)



Missing values count:
 Application_Date    0
Decision_Date       0
Country             0
Visa_Type           0
dtype: int64


In [32]:
df["Application_Date"] = df["Application_Date"].fillna(df["Application_Date"].mode()[0])
df["Decision_Date"] = df["Decision_Date"].fillna(df["Decision_Date"].mode()[0])

df["Country"] = df["Country"].fillna("Unknown")



In [33]:
# Create processing_office if not present
df["processing_office"] = df.get("processing_office", "Unknown")

df["Application_Date"] = pd.to_datetime(df["Application_Date"])
df["Decision_Date"] = pd.to_datetime(df["Decision_Date"])

print(df)


  Application_Date Decision_Date         Country Visa_Type processing_office  \
0       2024-01-01    2024-02-01           India   Student           Unknown   
1       2024-02-15    2024-03-20   United States   Tourist           Unknown   
2       2024-03-10    2024-04-05  United Kingdom      Work           Unknown   
3       2024-04-05    2024-05-01          Canada   Tourist           Unknown   
4       2024-05-12    2024-06-20       Australia   Student           Unknown   
5       2024-06-01    2024-06-25         Germany      Work           Unknown   
6       2024-06-18    2024-07-28           India      Work           Unknown   
7       2024-07-02    2024-08-05          France   Tourist           Unknown   
8       2024-07-25    2024-09-15           Japan   Student           Unknown   
9       2024-08-10    2024-10-01          Brazil      Work           Unknown   

   Processing_Days  
0               31  
1               34  
2               26  
3               26  
4             

In [36]:
df["processing_days"] = (df["Decision_Date"] - df["Application_Date"]).dt.days
print("\nAfter calculating processing days:\n", df)



After calculating processing days:
   Application_Date Decision_Date         Country Visa_Type processing_office  \
0       2024-01-01    2024-02-01           India   Student           Unknown   
1       2024-02-15    2024-03-20   United States   Tourist           Unknown   
2       2024-03-10    2024-04-05  United Kingdom      Work           Unknown   
3       2024-04-05    2024-05-01          Canada   Tourist           Unknown   
4       2024-05-12    2024-06-20       Australia   Student           Unknown   
5       2024-06-01    2024-06-25         Germany      Work           Unknown   
6       2024-06-18    2024-07-28           India      Work           Unknown   
7       2024-07-02    2024-08-05          France   Tourist           Unknown   
8       2024-07-25    2024-09-15           Japan   Student           Unknown   
9       2024-08-10    2024-10-01          Brazil      Work           Unknown   

   Processing_Days  processing_days  
0               31               31  
1     

In [37]:
df_encoded = pd.get_dummies(df, columns=["Country", "Visa_Type", "processing_office"])
print("\nEncoded DataFrame:\n", df_encoded)


Encoded DataFrame:
   Application_Date Decision_Date  Processing_Days  processing_days  \
0       2024-01-01    2024-02-01               31               31   
1       2024-02-15    2024-03-20               34               34   
2       2024-03-10    2024-04-05               26               26   
3       2024-04-05    2024-05-01               26               26   
4       2024-05-12    2024-06-20               39               39   
5       2024-06-01    2024-06-25               24               24   
6       2024-06-18    2024-07-28               40               40   
7       2024-07-02    2024-08-05               34               34   
8       2024-07-25    2024-09-15               52               52   
9       2024-08-10    2024-10-01               52               52   

   Country_Australia  Country_Brazil  Country_Canada  Country_France  \
0              False           False           False           False   
1              False           False           False           F