In [1]:
import warnings
warnings.filterwarnings('ignore')

In [27]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('https://query.data.world/s/wh6j7rxy2hvrn4ml75ci62apk5hgae')

In [5]:
df.head(3)

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,Armenia,1992,1,AreaPerCap,0.140292,0.199546,0.097188051,0.036888,0.02932,0.0,0.5032351,3A
1,Armenia,1992,1,AreaTotHA,483000.0,687000.0,334600.0,127000.0,100943.0008,0.0,1732543.0,3A
2,Armenia,1992,1,BiocapPerCap,0.159804,0.135261,0.084003213,0.013742,0.033398,0.0,0.4262086,3A


In [6]:
df['QScore'].value_counts()

3A    51481
2A    10576
2B    10096
1B       16
1A       16
Name: QScore, dtype: int64

In [7]:
df.isna().sum()

country               0
year                  0
country_code          0
record                0
crop_land         20472
grazing_land      20472
forest_land       20472
fishing_ground    20473
built_up_land     20473
carbon            20473
total                 9
QScore                1
dtype: int64

In [8]:
df = df.dropna()
sum(df.isna().sum())

0

In [9]:
df['QScore'] = df['QScore'].replace(['1A'], '2A')

In [10]:
df['QScore'].value_counts()

3A    51473
2A      240
Name: QScore, dtype: int64

In [11]:
df_2A = df[df.QScore=='2A']
df_3A = df[df.QScore=='3A'].sample(350, random_state = 2020)

In [12]:
data_df = df_2A.append(df_3A)

In [13]:
import sklearn.utils

In [14]:
data_df = sklearn.utils.shuffle(data_df,random_state = 2020)

In [15]:
data_df = data_df.reset_index(drop=True)

In [16]:
data_df.shape
data_df.QScore.value_counts()

3A    350
2A    240
Name: QScore, dtype: int64

In [17]:
data_df.drop(columns=['country_code', 'country', 'year'], inplace=True)

In [98]:
X = data_df.drop(columns='QScore')
y = data_df['QScore']

In [99]:
from sklearn.model_selection import train_test_split

In [100]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [101]:
y_train.value_counts()

3A    243
2A    170
Name: QScore, dtype: int64

In [102]:
from sklearn.preprocessing import LabelEncoder

In [103]:
encoder = LabelEncoder()
x_train.record = encoder.fit_transform(x_train.record)
x_test.record = encoder.transform(x_test.record)

In [104]:
import imblearn
from imblearn.over_sampling import SMOTE

In [105]:
smote = SMOTE(random_state=1)

In [106]:
x_train_balanced, y_balanced = smote.fit_sample(x_train, y_train)

In [107]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [108]:
normalised_train_df = scaler.fit_transform(x_train_balanced.drop(columns=['record']))

In [109]:
normalised_train_df = pd.DataFrame(normalised_train_df, columns=x_train_balanced.drop(columns=['record']).columns)

In [110]:
normalised_train_df['record'] = x_train_balanced['record']

In [111]:
x_test = x_test.reset_index(drop=True)

In [112]:
normalised_test_df = scaler.transform(x_test.drop(columns=['record']))
normalised_test_df = pd.DataFrame(normalised_test_df, columns=x_test.drop(columns=['record']).columns)
normalised_test_df['record'] = x_test['record']

In [113]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()

In [114]:
log_reg.fit(normalised_train_df, y_balanced)

LogisticRegression()

In [115]:
from sklearn.metrics import recall_score, accuracy_score, precision_score, f1_score, confusion_matrix

In [116]:
new_predictions = log_reg.predict(normalised_test_df)

In [117]:
cnf_mat = confusion_matrix(y_test, new_predictions, labels=['2A', '3A'])

In [118]:
cnf_mat

array([[35, 35],
       [53, 54]], dtype=int64)

In [119]:
from sklearn.model_selection import cross_val_score

In [120]:
scores = cross_val_score(log_reg, normalised_train_df, y_balanced, cv=5, scoring='f1_macro')

In [121]:
scores

array([0.47953764, 0.56325043, 0.50468085, 0.48183761, 0.55193898])

In [122]:
f1 = f1_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print('F1: {}'.format(round(f1*100), 2))

F1: 44


In [131]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
kf.split(normalised_train_df) 
f1_scores = []

In [132]:
for train_index, test_index in kf.split(normalised_train_df):
    x_train, x_test = normalised_train_df.iloc[train_index],normalised_train_df.iloc[test_index]
    y_train, y_test = y_balanced[train_index],y_balanced[test_index]
    model = LogisticRegression().fit(x_train, y_train)
    
    f1_scores.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), 
                   pos_label='2A')*100)

In [133]:
f1_scores

[42.85714285714287,
 54.961832061068705,
 55.55555555555556,
 45.99999999999999,
 0.0]

In [123]:
from sklearn.model_selection import LeaveOneOut
loo = LeaveOneOut()
scores = cross_val_score(LogisticRegression(), normalised_train_df, y_balanced, cv=loo, 
                         scoring='f1_macro')
average_score = scores.mean() * 100

In [124]:
average_score

51.440329218106996

In [126]:
precision = precision_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print('Precision: {}'.format(round(precision*100), 2))

Precision: 40


In [125]:
recall = recall_score(y_true=y_test, y_pred=new_predictions, pos_label='2A')
print('Recall: {}'.format(round(recall*100), 2)) 

Recall: 50


In [127]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
f1_scores_1 = []
#run for every split
for train_index, test_index in skf.split(normalised_train_df, y_balanced):
    x_train, x_test = np.array(normalised_train_df)[train_index],np.array(normalised_train_df)[test_index]
    y_train, y_test  = y_balanced[train_index], y_balanced[test_index]
    model = LogisticRegression().fit(x_train, y_train)
    #save result to list
    f1_scores_1.append(f1_score(y_true=y_test, y_pred=model.predict(x_test), pos_label='2A'))