In [1]:
import os

import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import jdatetime # To convert Persian (Jalali) date to an English (Gregorian) date

In [3]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score

In [4]:
#Step 1: Import data
# The location of the raw CSV is machine-specific. Set the CYF_RAW_DATASET
# environment variable to point at your own copy of Raw-DataSet.csv; when it is
# not set, the original author's path below is used as the default.
file = os.environ.get(
    "CYF_RAW_DATASET",
    "/Users/off-centreproductionsmacpro/Documents/GitHub/CYF_Python_Data_Project/data/raw/Raw-DataSet.csv",
)
df = pd.read_csv(file, encoding='utf-8')

In [5]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it must not be wrapped in print() (that would add a stray "None" line).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   _id                      32 non-null     int64 
 1   internal_id              32 non-null     int64 
 2   contract_id              32 non-null     object
 3   contract_title           32 non-null     object
 4   contract_value           32 non-null     object
 5   contract_date            32 non-null     object
 6   contract_ref_num         32 non-null     object
 7   contract_url             32 non-null     object
 8   contract_finish_date     32 non-null     object
 9   employer                 32 non-null     object
 10  contractor               32 non-null     object
 11  signed_by                32 non-null     object
 12  contract_registery_date  32 non-null     object
 13  contract_type            32 non-null     object
 14  contract_funding         32 non-null     obj

In [6]:
# Show the raw contract_value column before any cleaning.
# Amounts are quoted in Iranian Rial [1000 IR = 0.19 GBP].
print(df.loc[:, 'contract_value'])

0       538,115,195,389
1        10,990,781,147
2     1,211,757,426,886
3     1,758,107,559,936
4     1,262,380,000,000
5       936,407,993,522
6        43,873,974,805
7        36,283,680,925
8     1,498,439,106,404
9     1,896,815,094,565
10       43,999,347,425
11       16,671,309,458
12      266,734,072,022
13    2,552,072,157,584
14      773,205,409,739
15       39,994,604,176
16       20,000,000,000
17    1,303,958,638,252
18      700,899,290,658
19    1,575,592,974,356
20    5,808,778,787,179
21      784,095,103,455
22       32,433,978,200
23       16,474,824,000
24      174,144,413,750
25       49,673,390,000
26      200,972,110,000
27       59,828,690,000
28       23,326,928,485
29       19,383,083,000
30      836,395,666,966
31      113,650,000,000
Name: contract_value, dtype: object


In [7]:
# Remove commas and convert the contract_value string type to numeric type.
# The dtype guard keeps this cell safe to re-run: once the column is numeric the
# `.str` accessor is no longer available and would raise AttributeError.
if not pd.api.types.is_numeric_dtype(df['contract_value']):
    # regex=False: the comma is a literal separator, not a pattern.
    df['contract_value'] = (
        df['contract_value'].str.replace(',', '', regex=False).astype(float)
    )
print(df['contract_value'])

0     5.381152e+11
1     1.099078e+10
2     1.211757e+12
3     1.758108e+12
4     1.262380e+12
5     9.364080e+11
6     4.387397e+10
7     3.628368e+10
8     1.498439e+12
9     1.896815e+12
10    4.399935e+10
11    1.667131e+10
12    2.667341e+11
13    2.552072e+12
14    7.732054e+11
15    3.999460e+10
16    2.000000e+10
17    1.303959e+12
18    7.008993e+11
19    1.575593e+12
20    5.808779e+12
21    7.840951e+11
22    3.243398e+10
23    1.647482e+10
24    1.741444e+11
25    4.967339e+10
26    2.009721e+11
27    5.982869e+10
28    2.332693e+10
29    1.938308e+10
30    8.363957e+11
31    1.136500e+11
Name: contract_value, dtype: float64


In [8]:
# Show the raw contract_date column; the dates use the Persian (Jalali) calendar.
print(df.loc[:, 'contract_date'])

0      1395-6-10
1     1395-12-23
2      1396-4-31
3       1396-2-2
4     1396-10-13
5       1397-7-9
6       1397-2-1
7     1394-12-25
8      1395-8-15
9      1395-6-14
10     1397-1-20
11      1397-6-1
12     1398-1-25
13     1398-3-27
14     1398-4-11
15     1397-1-21
16      1397-5-1
17    1395-12-24
18     1395-7-12
19     1397-2-19
20    1395-12-24
21      1395-8-2
22    1390-11-20
23      1393-6-5
24     1392-2-24
25      1391-3-1
26      1391-3-1
27     1393-10-1
28     1393-4-24
29      1393-6-5
30     1396-11-8
31      1395-7-3
Name: contract_date, dtype: object


In [9]:
# Function to convert Persian dates (Jalali) to Gregorian dates:

def convert_to_gregorian(jalali_date):
    """Convert a Jalali date to its Gregorian equivalent.

    Parameters
    ----------
    jalali_date : str or object with a ``togregorian()`` method
        Either a 'year-month-day' string such as '1395-6-10' (the format used
        in the contract_date column), or an already-built date object exposing
        ``togregorian()`` (e.g. ``jdatetime.date``).

    Returns
    -------
    Whatever ``togregorian()`` returns for the given date.
    """
    if isinstance(jalali_date, str):
        # Split once and reuse the pieces instead of re-splitting per field.
        parts = jalali_date.split('-')
        jalali_date = jdatetime.date(int(parts[0]), int(parts[1]), int(parts[2]))
    return jalali_date.togregorian()

df['contract_date'] = df['contract_date'].apply(convert_to_gregorian)

print(df['contract_date'])

0     2016-08-31
1     2017-03-13
2     2017-07-22
3     2017-04-22
4     2018-01-03
5     2018-10-01
6     2018-04-21
7     2016-03-15
8     2016-11-05
9     2016-09-04
10    2018-04-09
11    2018-08-23
12    2019-04-14
13    2019-06-17
14    2019-07-02
15    2018-04-10
16    2018-07-23
17    2017-03-14
18    2016-10-03
19    2018-05-09
20    2017-03-14
21    2016-10-23
22    2012-02-09
23    2014-08-27
24    2013-05-14
25    2012-05-21
26    2012-05-21
27    2014-12-22
28    2014-07-15
29    2014-08-27
30    2018-01-28
31    2016-09-24
Name: contract_date, dtype: object
