In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
#use get dummies  function to transforme the country dataset in number: 1,0,1 ...
# x as country and y as female baby.
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
df= pd.read_csv('newdemography.csv')
df

Unnamed: 0,unique_id,Year,Country / territory of asylum/residence,Location Name,Female 0-4,Female 5-17,Male 0-4,Male 5-17,year_id,country_id,location_id
0,0,2001,Afghanistan,Kabul,0,1,0,0,1,1,1
1,1,2001,Afghanistan,Various,14335,45451,14716,47522,1,1,2
2,2,2001,Afghanistan,Herat,0,0,0,0,1,1,3
3,3,2001,Angola,Viana,484,1687,597,1645,1,2,4
4,4,2001,Angola,Moxico,219,734,226,711,1,2,5
...,...,...,...,...,...,...,...,...,...,...,...
18351,18351,2016,Zambia,Mayukwayukwa : Point,983,1456934,982,1416903,16,103,3703
18352,18352,2016,Zambia,Meheba : Point,1248,23641598,1205,23671693,16,103,3702
18353,18353,2016,Zambia,Zambia : Dispersed in the country / territory,637,1444979,602,13141018,16,103,3705
18354,18354,2016,Zimbabwe,Harare : City,220,11782,204,11591,16,104,4702


### Selecting only 5 coutries for getdummies

In [3]:
# Selecting five countries
selected_countries = ['Afghanistan', 'Pakistan', 'Somalia', 'Angola', 'Myanmar']

In [4]:
# Filtering DataFrame to include only selected countries
filtered_df = df[df['Country / territory of asylum/residence'].isin(selected_countries)]

In [5]:
# Applying get_dummies() function on filtered DataFrame
encoded_df = pd.get_dummies(filtered_df['Country / territory of asylum/residence'])

In [6]:
# Output
encoded_df

Unnamed: 0,Afghanistan,Angola,Myanmar,Pakistan,Somalia
0,1,0,0,0,0
1,1,0,0,0,0
2,1,0,0,0,0
3,0,1,0,0,0
4,0,1,0,0,0
...,...,...,...,...,...
18073,0,0,0,0,1
18074,0,0,0,0,1
18075,0,0,0,0,1
18076,0,0,0,0,1


In [7]:
filtered_df.shape

(1907, 11)

## A dataframe with only "Year" and "female baby"

In [8]:
### We are going to predict the number of female babies asylum seeker by year.

In [9]:
year_babyfemale = filtered_df[['Year', 'Female 0-4']]  # Selected columns

In [10]:
year_babyfemale

Unnamed: 0,Year,Female 0-4
0,2001,0
1,2001,14335
2,2001,0
3,2001,484
4,2001,219
...,...,...
18073,2016,18297
18074,2016,19213
18075,2016,9365
18076,2016,3131


### Concate "encoded_df" with "year_babyfemale"

In [11]:
concatenated_df = pd.concat([encoded_df, year_babyfemale], axis=1)

In [12]:
concatenated_df

Unnamed: 0,Afghanistan,Angola,Myanmar,Pakistan,Somalia,Year,Female 0-4
0,1,0,0,0,0,2001,0
1,1,0,0,0,0,2001,14335
2,1,0,0,0,0,2001,0
3,0,1,0,0,0,2001,484
4,0,1,0,0,0,2001,219
...,...,...,...,...,...,...,...
18073,0,0,0,0,1,2016,18297
18074,0,0,0,0,1,2016,19213
18075,0,0,0,0,1,2016,9365
18076,0,0,0,0,1,2016,3131


In [13]:
encoded_df.dtypes

Afghanistan    uint8
Angola         uint8
Myanmar        uint8
Pakistan       uint8
Somalia        uint8
dtype: object

In [14]:
year_babyfemale.dtypes

Year          int64
Female 0-4    int64
dtype: object

In [15]:
y = concatenated_df["Female 0-4"]
X = concatenated_df.drop("Female 0-4", axis=1)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#scaler = MinMaxScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)
X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)
y_train = y_train.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

In [17]:
# Create an instance of the LinearRegression model
model = LinearRegression()


In [18]:
# Fit the model to the training data
model.fit(X_train, y_train)

#now i have a model.

In [19]:
pred_test = model.predict(X_test)
#repass the traning set. 
#compare Y and X

In [20]:
X_test

Unnamed: 0,Afghanistan,Angola,Myanmar,Pakistan,Somalia,Year
11883,0,0,0,1,0,2013
2517,0,1,0,0,0,2005
17883,0,0,0,1,0,2016
15519,1,0,0,0,0,2015
5750,0,0,0,1,0,2008
...,...,...,...,...,...,...
11877,0,0,0,1,0,2013
17878,0,0,0,1,0,2016
7972,0,0,0,1,0,2010
6766,0,0,0,1,0,2009


In [21]:
pred_test

array([ 1196.,  1424.,   616.,  2572.,  2160.,  1004.,  2932.,   652.,
        1968.,  1004.,  2380.,  1580.,  1388.,  4308.,  3924.,   808.,
        1004.,  1772., 15180.,  1004.,  2352.,  1580.,  1196.,  1388.,
        4116.,  1772.,  4308.,   808.,  2572.,  2380.,  3540.,  1580.,
        1004.,  1004.,  1036.,  2572.,  3152.,  1388.,  1968.,   808.,
        1968.,  2352.,  3924.,  1968.,  1196.,  2960.,  2160.,   808.,
        4504.,  2572.,  7516.,  2380.,  2572.,  2352.,  2352.,   616.,
         808.,  2160.,  8484.,  2572.,  8096.,  7516.,  3344.,   616.,
        2352.,  1196.,  3152.,  2572.,  3540.,   808.,   808.,   616.,
        7324.,  1616.,  1388.,  1196.,   808.,  1580.,  2004.,  1968.,
        1196.,  1772.,   616.,  7516.,  1968.,  1004.,  9060.,  1004.,
        1004.,  8868.,  2380.,  2160.,  7516.,  2764.,  2160.,  2160.,
        8288.,  1388.,  2572.,   460.,  2352.,  7708.,  2572.,  4116.,
        1580.,  1968.,  1424.,  2572.,  1004.,  1772.,  4504.,  2352.,
      

In [22]:
y_test

0         3
1         5
2       823
3      2603
4      1223
       ... 
377      75
378     411
379      41
380     106
381    5906
Name: Female 0-4, Length: 382, dtype: int64

# our score

In [23]:
model.score(X_test, y_test)
#pas ouf !!!!

0.008188227435025341