In [1]:
import pandas as pd
import numpy as np

In [2]:
#using python to download the dataset (uncomment the codes below and execute them)
#import urllib.request
#url = 'https://query.data.world/s/wh6j7rxy2hvrn4ml75ci62apk5hgae'
#filename = 'dataworld.csv'
#urllib.request.urlretrieve(url, filename)

In [3]:
#using pandas to download the dataset (uncomment the code below and execute them)
#df = pd.read_csv('https://query.data.world/s/wh6j7rxy2hvrn4ml75ci62apk5hgae')
#df.to_csv('dataset.csv')

In [4]:
#load the dataset
df = pd.read_csv('dataset.csv', low_memory=False)
df.head()

Unnamed: 0.1,Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
0,0,Armenia,1992,1,AreaPerCap,0.140292,0.199546,0.097188051,0.036888,0.02932,0.0,0.5032351,3A
1,1,Armenia,1992,1,AreaTotHA,483000.0,687000.0,334600.0,127000.0,100943.0008,0.0,1732543.0,3A
2,2,Armenia,1992,1,BiocapPerCap,0.159804,0.135261,0.084003213,0.013742,0.033398,0.0,0.4262086,3A
3,3,Armenia,1992,1,BiocapTotGHA,550176.2427,465677.9722,289207.1078,47311.55172,114982.2793,0.0,1467355.0,3A
4,4,Armenia,1992,1,EFConsPerCap,0.38751,0.189462,1.26e-06,0.004165,0.033398,1.114093,1.728629,3A


In [5]:
#check distribution of target variable
df['QScore'].value_counts()

3A    51481
2A    10576
2B    10096
1A       16
1B       16
Name: QScore, dtype: int64

In [6]:
#checking for null values in the dataset
df.isna().sum()

Unnamed: 0            0
country               0
year                  0
country_code          0
record                0
crop_land         20472
grazing_land      20472
forest_land       20472
fishing_ground    20473
built_up_land     20473
carbon            20473
total                 9
QScore                1
dtype: int64

In [7]:
#for simplicity, we will drop the rows with missing values.
df.dropna(inplace = True)
df.isna().sum()
#An obvious change in our target variable after removing the missing values is that there are only three classes left 
#and from the distribution of the 3 classes, we can see that there is an obvious imbalance between the classes. 
#There are methods that can be applied to handle this imbalance such as oversampling and undersampling.
#Oversampling involves increasing the number of instances in the class with fewer instances while undersampling 
#involves reducing the data points in the class with more instances.

Unnamed: 0        0
country           0
year              0
country_code      0
record            0
crop_land         0
grazing_land      0
forest_land       0
fishing_ground    0
built_up_land     0
carbon            0
total             0
QScore            0
dtype: int64

In [8]:
#drop the unnamed column
#reset the dataframe index
df.drop('Unnamed: 0', axis = 1, inplace = True)
df = df.reset_index(drop = True)

In [9]:
#For now, we will convert this to a binary classification problem by combining class '2A' and '1A'.
df['QScore'] = df.loc[:, 'QScore'].replace(['1A'], '2A')
df.QScore.value_counts()

3A    51473
2A      240
Name: QScore, dtype: int64

In [10]:
df_2A = df[df.QScore=='2A']
df_3A = df[df.QScore=='3A'].sample(350)
data_df = df_2A.append(df_3A)
data_df

Unnamed: 0,country,year,country_code,record,crop_land,grazing_land,forest_land,fishing_ground,built_up_land,carbon,total,QScore
1096,Algeria,2016,4,AreaPerCap,2.072989e-01,8.112722e-01,0.048357265,2.258528e-02,2.998367e-02,0.000000,1.119497e+00,2A
1097,Algeria,2016,4,AreaTotHA,8.417600e+06,3.294260e+07,1963600,9.171000e+05,1.217520e+06,0.000000,4.545842e+07,2A
1098,Algeria,2016,4,BiocapPerCap,2.021916e-01,2.636077e-01,0.027166736,7.947991e-03,2.924496e-02,0.000000,5.301590e-01,2A
1099,Algeria,2016,4,BiocapTotGHA,8.210214e+06,1.070408e+07,1103135.245,3.227369e+05,1.187524e+06,0.000000,2.152769e+07,2A
1100,Algeria,2016,4,EFConsPerCap,6.280528e-01,1.810332e-01,0.162800822,1.472910e-02,2.924496e-02,1.391455,2.407316e+00,2A
...,...,...,...,...,...,...,...,...,...,...,...,...
35721,Eritrea,2011,178,AreaPerCap,1.546476e-01,1.542006e+00,0.341386782,1.644002e+00,4.362179e-02,0.000000,3.725665e+00,3A
48249,Ethiopia,2006,238,AreaPerCap,1.678108e-01,2.661612e-01,0.163082883,1.322753e-01,2.971261e-02,0.000000,7.590428e-01,3A
34069,Philippines,1990,171,EFConsPerCap,3.618597e-01,2.173961e-02,0.163061986,2.654861e-01,4.694141e-02,0.330045,1.189133e+00,3A
4281,Bangladesh,2005,16,AreaTotHA,8.711000e+06,6.000000e+05,1455000,8.002100e+06,2.206280e+06,0.000000,2.097438e+07,3A


In [11]:
import sklearn.utils
data_df = sklearn.utils.shuffle(data_df)
data_df = data_df.reset_index(drop=True)
data_df.shape
data_df.QScore.value_counts()

3A    350
2A    240
Name: QScore, dtype: int64

In [12]:
#more preprocessing
data_df = data_df.drop(columns=['country_code', 'country', 'year'])
X = data_df.drop(columns='QScore')
y = data_df['QScore']

In [13]:
#split the data into training and testing sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
y_train.value_counts()

3A    246
2A    167
Name: QScore, dtype: int64

In [14]:
#installing imblearn module (uncomment the pip command to install imblearn)
#!pip install imblearn

In [15]:
#the record feature needs to be encoded
x_train.loc[:, 'record']

285    EFConsTotGHA
113    EFConsPerCap
18     BiocapTotGHA
76     BiocapPerCap
206    BiocapTotGHA
           ...     
277    BiocapTotGHA
9         AreaTotHA
359    EFConsTotGHA
192    EFConsPerCap
559    BiocapPerCap
Name: record, Length: 413, dtype: object

In [16]:
#encode categorical variable
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
x_train.record = encoder.fit_transform(x_train.record)
x_test.record = encoder.transform(x_test.record)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [17]:
#the encoded feature
x_train.record

285    5
113    4
18     3
76     2
206    3
      ..
277    3
9      1
359    5
192    4
559    2
Name: record, Length: 413, dtype: int32

In [18]:
#There is still an imbalance in the class distribution. For this, we use SMOTE only on the training data to handle this.
import imblearn
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=1)
x_train_balanced, y_balanced = smote.fit_sample(x_train, y_train)
y_train.value_counts()

3A    246
2A    167
Name: QScore, dtype: int64

In [19]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
normalised_train_df = scaler.fit_transform(x_train_balanced.drop(columns=['record']))
normalised_train_df = pd.DataFrame(normalised_train_df, columns=x_train_balanced.drop(columns=['record']).columns)
normalised_train_df['record'] = x_train_balanced['record']

In [20]:
x_test = x_test.reset_index(drop=True)
normalised_test_df = scaler.transform(x_test.drop(columns=['record']))
normalised_test_df = pd.DataFrame(normalised_test_df, columns=x_test.drop(columns=['record']).columns)
normalised_test_df['record'] = x_test['record']

In [21]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(normalised_train_df, y_balanced)
#returns
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

LogisticRegression()