In [16]:

from imblearn.over_sampling import RandomOverSampler
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


In [17]:
import pandas as pd
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
sample_submission = pd.read_csv("sample_submission.csv").sort_values(by="station_id")


In [27]:
normalized_df=(train_data-train_data.min())/(train_data.max()-train_data.min())
normalized_df.drop(columns=['station_id'], inplace = True)
normalized_df

Unnamed: 0,Aluminium_2019,Aluminium_2020,Ammonium_2019,Ammonium_2020,Boron_2019,Boron_2020,Chloride_2019,Chloride_2020,Coli-like-bacteria-Colilert_2019,Coli-like-bacteria-Colilert_2020,...,Taste-ball-units_2020,Taste-dilution-degree_2019,Taste-dilution-degree_2020,Turbidity-NTU_2019,Turbidity-NTU_2020,pH _2019,pH _2020,compliance_2019,compliance_2020,compliance_2021
0,,,0.006623,0.009146,,,,,,,...,0.000000,,,0.028503,0.001368,0.607477,0.534759,0.0,0.0,0.0
1,,,0.006623,0.009146,,,,,,,...,,0.000000,0.000000,0.016459,0.019608,0.373832,0.267380,0.0,0.0,0.0
2,,0.130016,0.006623,0.067073,,0.011832,,,,,...,,0.000000,0.000000,0.016459,0.019608,0.607477,0.588235,0.0,0.0,0.0
3,,,0.018140,0.009146,0.172796,0.120010,,,,,...,0.333333,,,0.006423,0.008208,0.439252,0.524064,0.0,0.0,0.0
4,,,0.009502,0.009146,,,,,,,...,0.000000,,,0.000401,1.000000,0.560748,0.748663,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,,,0.095883,,,,,,0.0,0.0,...,,0.000000,0.066667,0.016459,0.019608,0.514019,0.588235,0.0,0.0,1.0
436,,,,,,,,,,,...,,0.000000,0.000000,0.016459,0.019608,0.233645,0.481283,0.0,0.0,0.0
437,,0.064995,,0.076220,,0.034456,,0.028461,0.0,0.0,...,,0.200000,,0.217182,0.247606,0.514019,0.534759,1.0,1.0,1.0
438,,,0.006623,0.009146,,,,,,,...,0.000000,,,0.012445,0.003648,0.794393,0.748663,1.0,0.0,0.0


In [18]:
#verify that each station id is unique
print(train_data["station_id"].is_unique)

train_data_rm_col = train_data.copy(deep=True)
test_data_rm_col = test_data.copy(deep=True)

#see how many nan values in columns
print(train_data.shape)
print()
for column in train_data.columns:
    print(column + "     Nan count: " + str(train_data[column].isna().sum()))
    if (train_data_rm_col[column].isna().sum() > 220):
        
        train_data_rm_col.drop(columns=[column])
        test_data_rm_col.drop(columns=[column])


True
(440, 58)

station_id     Nan count: 0
Aluminium_2019     Nan count: 337
Aluminium_2020     Nan count: 346
Ammonium_2019     Nan count: 133
Ammonium_2020     Nan count: 150
Boron_2019     Nan count: 324
Boron_2020     Nan count: 332
Chloride_2019     Nan count: 329
Chloride_2020     Nan count: 326
Coli-like-bacteria-Colilert_2019     Nan count: 315
Coli-like-bacteria-Colilert_2020     Nan count: 315
Coli-like-bacteria_2019     Nan count: 118
Coli-like-bacteria_2020     Nan count: 120
Colony-count-at-22-C_2019     Nan count: 70
Colony-count-at-22-C_2020     Nan count: 53
Color-Pt-Co-unit_2019     Nan count: 381
Color-Pt-Co-unit_2020     Nan count: 377
Color-Pt/Co-scale_2019     Nan count: 68
Color-Pt/Co-scale_2020     Nan count: 61
Electrical-conductivity_2019     Nan count: 10
Electrical-conductivity_2020     Nan count: 4
Enterococci_2019     Nan count: 256
Enterococci_2020     Nan count: 267
Escherichia-coli-Colilert_2019     Nan count: 316
Escherichia-coli-Colilert_2020     Nan 

In [23]:
train_data_nan_mean = train_data.copy(deep=True)
test_data_nan_mean = test_data.copy(deep=True)

def nan_to_mean(train_data_nan_mean,test_data_nan_mean):
    for col in train_data_nan_mean.columns:
        train_data_nan_mean[col].fillna(value=train_data_nan_mean[col].mean(), inplace=True)
    for col in test_data_nan_mean.columns:
        test_data_nan_mean[col].fillna(value=test_data_nan_mean[col].mean(), inplace=True)
    
    return (train_data_nan_mean,test_data_nan_mean)
    
means = nan_to_mean(train_data_nan_mean,test_data_nan_mean)
train_data_nan_mean = means[0]
test_data_nan_mean = means[1]
test_data_nan_mean


Unnamed: 0,station_id,Aluminium_2019,Aluminium_2020,Ammonium_2019,Ammonium_2020,Boron_2019,Boron_2020,Chloride_2019,Chloride_2020,Coli-like-bacteria-Colilert_2019,...,Taste-ball-units_2019,Taste-ball-units_2020,Taste-dilution-degree_2019,Taste-dilution-degree_2020,Turbidity-NTU_2019,Turbidity-NTU_2020,pH _2019,pH _2020,compliance_2019,compliance_2020
0,163,5.000000,5.000000,0.080000,0.080000,0.071000,0.062000,130.000000,102.000000,0.544828,...,0.447368,0.512195,1.000000,1.0,1.180000,1.90,8.170000,8.12,0,0
1,167,13.143023,13.053659,0.080000,0.080000,4.311006,0.301936,71.472549,53.947321,0.544828,...,0.447368,0.512195,1.000000,3.0,3.900000,1.54,7.830000,7.81,1,0
2,171,13.143023,13.053659,0.108809,0.110773,4.311006,0.301936,112.000000,90.000000,0.544828,...,0.447368,0.512195,2.000000,1.0,1.400000,1.50,7.500000,7.60,0,1
3,174,5.000000,5.000000,0.050000,0.090000,0.072000,0.075000,248.000000,243.000000,0.000000,...,0.447368,0.512195,2.000000,8.0,1.000000,1.00,7.700000,7.80,0,0
4,178,13.143023,13.053659,0.050000,0.110773,4.311006,0.301936,71.472549,53.947321,0.544828,...,0.447368,0.512195,1.000000,1.0,1.000000,1.00,7.500000,7.40,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,1941,13.143023,13.053659,0.050000,0.050000,4.311006,0.301936,71.472549,53.947321,0.544828,...,0.447368,0.512195,1.385827,2.0,4.800000,1.00,9.200000,6.70,1,1
185,2087,5.000000,13.053659,0.108809,0.110773,0.855000,0.301936,417.000000,53.947321,0.000000,...,0.447368,0.512195,1.385827,2.0,1.100000,1.00,7.700000,7.60,1,0
186,2206,27.200000,31.100000,0.050000,0.050000,0.018000,0.013000,41.000000,35.000000,1.000000,...,0.447368,0.512195,2.000000,2.0,1.000000,1.00,7.500000,7.40,1,1
187,2303,13.143023,5.000000,0.108809,0.050000,4.311006,0.147000,71.472549,40.000000,0.000000,...,0.447368,0.512195,1.385827,4.0,1.284111,1.80,7.583552,8.00,0,1


In [5]:
#remove the remaining nan values from train_data_rm_col and test_data_rm_col
means = nan_to_mean(train_data_rm_col,test_data_rm_col)
train_data_rm_col = means[0]
test_data_rm_col = means[1]

X_train_rm_col = train_data_rm_col.drop(columns=["compliance_2021"])
y_train_rm_col = train_data_rm_col["compliance_2021"]


In [6]:

y_train_nan_mean = train_data_nan_mean["compliance_2021"]
X_train_nan_mean = train_data_nan_mean.drop(columns=["compliance_2021"])





In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
def randomForest(train_X,train_y,test_data):
    rf = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0)
    rf.fit(train_X, train_y)
    rf_outcome = pd.DataFrame(rf.predict(test_data))
    stationID = test_data.station_id
    rf_outcome = pd.concat([stationID, rf_outcome], axis = 1)
    return rf_outcome

rf_outcome = randomForest(X_train_nan_mean,y_train_nan_mean,test_data_nan_mean)

#compression_opts = dict(method='zip', archive_name='randomforest_1.csv')

#rf_outcome.to_csv('out.zip', compression = compression_opts)

rf2_outcome = randomForest(X_train_rm_col,y_train_rm_col,test_data_rm_col)

#compression_opts = dict(method='zip', archive_name='randomforest_2.csv')

#rf_outcome.to_csv('out.zip', compression = compression_opts)

In [8]:
def svc(X_train,y_train, test_data):
    
    clf = make_pipeline(StandardScaler(), SVC(gamma='auto',probability=False))
    clf.fit(X_train, y_train)
    outcome = pd.DataFrame(clf.predict(test_data))
    stationID = test_data.station_id
    outcome = pd.concat([stationID, outcome], axis = 1)
    return outcome

In [9]:
svc_outcome = svc(X_train_nan_mean,y_train_nan_mean, test_data_nan_mean)
#compression_opts = dict(method='zip', archive_name='svm_svc_1.csv')
#svc_outcome.to_csv('out.zip', compression = compression_opts)"""

#Ei tööta. Ennustab ainult 0-le ja mitte 1


In [10]:
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X_train_nan_mean, y_train_nan_mean)

outcome = svc(X_resampled, y_resampled, test_data_nan_mean)

#compression_opts = dict(method='zip', archive_name='svm_svc_oversampled.csv')
#outcome.to_csv('out.zip', compression = compression_opts)



In [26]:
from sklearn.neural_network import MLPClassifier

def MLP(train_X,train_y,test_data):
    mlp_model = MLPClassifier((30,42,30),random_state=1).fit(train_X,train_y)
    outcome = pd.DataFrame(mlp_model.predict(test_data))
    stationID = test_data.station_id
    outcome = pd.concat([stationID, outcome], axis = 1)
    return outcome

outcome = MLP(X_train_nan_mean,y_train_nan_mean,test_data_nan_mean)
compression_opts = dict(method='zip', archive_name='MLP_1.csv')
outcome.to_csv('out.zip', compression = compression_opts)

In [11]:
#separate 2019 and 2020 data
df2019 = train_data.loc[:, train_data.columns.str.contains('station|2019')]
df2020 = train_data.loc[:, train_data.columns.str.contains('station|2020')]
df2020

Unnamed: 0,station_id,Aluminium_2020,Ammonium_2020,Boron_2020,Chloride_2020,Coli-like-bacteria-Colilert_2020,Coli-like-bacteria_2020,Colony-count-at-22-C_2020,Color-Pt-Co-unit_2020,Color-Pt/Co-scale_2020,...,Odour-dilution-level_2020,Oxidability_2020,Smell-ball-units_2020,Sodium_2020,Sulphate_2020,Taste-ball-units_2020,Taste-dilution-degree_2020,Turbidity-NTU_2020,pH _2020,compliance_2020
0,487,,0.05,,,,0.0,,1.9,,...,,,0.0,,,0.0,,0.2,7.60,0
1,1555,,0.05,,,,0.0,0.0,,0.0,...,1.0,,,,,,1.0,1.0,7.10,0
2,205,10.0,0.24,0.100,,,0.0,0.0,,0.0,...,1.0,,,,,,1.0,1.0,7.70,0
3,1228,,0.05,0.932,,,0.0,2.0,,4.0,...,,,1.0,,,1.0,,0.5,7.58,0
4,470,,0.05,,,,2.0,31.0,5.0,,...,,,0.0,,,0.0,,44.0,8.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435,830,,,,,0.0,,3.0,,3.0,...,2.0,,,,,,2.0,1.0,7.70,0
436,803,,,,,,0.0,10.0,,5.0,...,1.0,,,,,,1.0,1.0,7.50,0
437,1081,5.0,0.27,0.274,14.0,0.0,,66.0,,6.0,...,4.0,3.10,,28.4,3.0,,,11.0,7.60,1
438,458,,0.05,,,,0.0,1.0,2.7,,...,,,0.0,,,0.0,,0.3,8.00,0


In [12]:
print(data.columns)

NameError: name 'data' is not defined

In [None]:
piirsisaldused = {"Aluminium":200, "Ammoonium":0.5, "Boron": 1, "Chloride":250,
                 "Colony-count-at-22-C":100, "Electrical-conductivity":2500, "Enterococci":0,
                 "Escherichia-coli-Colilert":0, "Fluoride":1.5, "Iron": 200,
                 "Manganese":50, "Nitrate":50, "Sodium":200, "Sulphate":250, 
                 "pH ":9.5}

In [None]:
baseDf = data.drop(columns=['Coli-like-bacteria-Colilert_2019',
       'Coli-like-bacteria-Colilert_2020', 'Coli-like-bacteria_2019',
       'Coli-like-bacteria_2020', 'Odour-dilution-level_2019',
       'Odour-dilution-level_2020', 'Oxidability_2019', 'Oxidability_2020',
       'Smell-ball-units_2019', 'Smell-ball-units_2020','Taste-ball-units_2019', 'Taste-ball-units_2020',
       'Taste-dilution-degree_2019', 'Taste-dilution-degree_2020',
       'Turbidity-NTU_2019', 'Turbidity-NTU_2020' ])
for col in baseDf.columns:
    df=baseDf[[col, "compliance_2020"]]
    df = df.dropna()
    for item in piirsisaldused.items():
        if col.startswith(item[0]):
            limit = item[1]
            print(limit)
    print(df[df[col] > limit])
    


In [None]:
#exploring data
from matplotlib import pyplot as plt
import seaborn as sns

plt.figure(figsize = (30,10))

print("2019 info :")
print(df2019.describe())

print()
print()

print("2020 info :")
print(df2020.describe())
print()

print("2019 info :")
print(df2019.info())



In [None]:
#visualizing correlation with result variable to get an idea of what feature might be important.

sns.pairplot(df2020, x_vars = ["Aluminium_2020", "Ammonium_2020", "Boron_2020","Chloride_2020", "Coli-like-bacteria-Colilert_2020", "Coli-like-bacteria_2020", "Colony-count-at-22-C_2020", "Color-Pt-Co-unit_2020", "Color-Pt/Co-scale_2020", "Odour-dilution-level_2020", "Oxidability_2020", "Smell-ball-units_2020", "Sodium_2020", "Sulphate_2020", "Taste-ball-units_2020", "Taste-dilution-degree_2020", "Turbidity-NTU_2020" ,"pH _2020"], y_vars = ["compliance_2020"])

In [None]:
#normalize the data (min-max normalization)
normalized_df2019=(df2019-df2019.min())/(df2019.max()-df2019.min())
normalized_df2019.drop(columns=['station_id'], inplace = True)
print(normalized_df2019[["Ammonium_2019","compliance_2019"]].to_string())

In [None]:
#try to predict with every column separately

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

def ModelForEachCol(data):
    features = data.iloc[:,-1]
    data = data.drop(columns=data.columns[-1:])
    
    for col in data.columns:
        pair = pd.concat([data[col], features], axis=1)
        pair.dropna(inplace=True)
        
        X_train, X_test, y_train, y_test = train_test_split(pair.iloc[:,0].values.reshape(-1,1), pair.iloc[:,1], train_size = 0.60, random_state = 1)
        
        rf = RandomForestClassifier(n_estimators=100, max_depth=4, random_state=0).fit(X_train, y_train)
        
        y_pred = rf.predict(X_test)
        confusion_matrix_result = confusion_matrix(y_test.values, y_pred, labels=[1,0])  # Label=[1,0], otherwise predicted and actual axes are swapped
        print("Confusion matrix:\n%s" % confusion_matrix_result)
            
ModelForEachCol(normalized_df2019)

In [None]:
normalized_df2019.drop(columns=["Aluminium_2019","Boron_2019","Chloride_2019", "Coli-like-bacteria-Colilert_2019", 
                                "Color-Pt-Co-unit_2019","Escherichia-coli-Colilert_2019"])
print(normalized_df2019.shape)
counts = normalized_df2019.count(axis=1)
for i in range(len(counts)):
    if counts[i] < 15:
        normalized_df2019.drop([i],axis=0, inplace=True)

print(normalized_df2019.shape)
