In this notebook ill try to train model for predicting weight of flying fish based on their characteristics

In [21]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import seaborn as sns

In [22]:
flying_fish_df = pd.read_csv("/data/TrainDataFlight.csv")
flying_fish_df.shape

(119, 7)

In [23]:
flying_fish_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119 entries, 0 to 118
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Species  119 non-null    object 
 1   Length1  119 non-null    float64
 2   Length2  119 non-null    float64
 3   Length3  119 non-null    float64
 4   Height   119 non-null    float64
 5   Width    119 non-null    float64
 6   Weight   119 non-null    float64
dtypes: float64(6), object(1)
memory usage: 6.6+ KB


In [24]:
flying_fish_df.describe()

Unnamed: 0,Length1,Length2,Length3,Height,Width,Weight
count,119.0,119.0,119.0,119.0,119.0,119.0
mean,26.836975,29.056303,31.956303,9.186697,4.463197,414.822689
std,10.368023,11.101318,11.964387,4.23788,1.637972,374.921759
min,7.5,8.4,8.8,1.7284,1.1484,0.0
25%,19.35,21.15,23.35,6.11,3.38565,127.5
50%,25.4,27.5,30.1,7.786,4.3056,290.0
75%,33.25,36.2,40.15,12.462,5.47275,667.5
max,59.0,63.4,68.0,18.7542,8.142,1650.0


In [25]:
flying_fish_df.isnull().sum()

Unnamed: 0,0
Species,0
Length1,0
Length2,0
Length3,0
Height,0
Width,0
Weight,0


In [26]:
flying_fish_df.duplicated().sum()

np.int64(0)

In [27]:
flying_fish_df['Species'].value_counts()

Unnamed: 0_level_0,count
Species,Unnamed: 1_level_1
Cheilopogon,41
Cypselurus,28
Exocoetus,16
Fodiater,14
Parexocoetus,10
Hirundichthys,8
Prognichthys,2


In [28]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
flying_fish_df['Species_Label'] = encoder.fit_transform(flying_fish_df['Species'])
flying_fish_df

Unnamed: 0,Species,Length1,Length2,Length3,Height,Width,Weight,Species_Label
0,Cypselurus,35.0,38.5,44.1,18.0369,6.3063,920.0,1
1,Hirundichthys,10.0,10.5,11.6,1.9720,1.1600,7.5,4
2,Cypselurus,23.9,26.5,31.1,12.3778,4.6961,340.0,1
3,Exocoetus,24.0,26.0,29.2,8.8768,4.4968,290.0,2
4,Parexocoetus,18.4,20.0,22.4,8.8928,3.2928,150.0,5
...,...,...,...,...,...,...,...,...
114,Parexocoetus,19.0,20.7,23.2,8.5376,3.2944,140.0,5
115,Cypselurus,31.0,33.5,38.7,14.4738,5.7276,650.0,1
116,Cheilopogon,20.0,22.0,23.5,6.1100,3.4075,120.0,0
117,Cheilopogon,27.8,30.0,31.6,7.6156,4.7716,320.0,0


In [29]:
flying_fish_df.describe()

Unnamed: 0,Length1,Length2,Length3,Height,Width,Weight,Species_Label
count,119.0,119.0,119.0,119.0,119.0,119.0,119.0
mean,26.836975,29.056303,31.956303,9.186697,4.463197,414.822689,1.647059
std,10.368023,11.101318,11.964387,4.23788,1.637972,374.921759,1.710329
min,7.5,8.4,8.8,1.7284,1.1484,0.0,0.0
25%,19.35,21.15,23.35,6.11,3.38565,127.5,0.0
50%,25.4,27.5,30.1,7.786,4.3056,290.0,1.0
75%,33.25,36.2,40.15,12.462,5.47275,667.5,3.0
max,59.0,63.4,68.0,18.7542,8.142,1650.0,6.0


In [30]:
X = flying_fish_df.drop(['Species', 'Weight'], axis= 1)
y = flying_fish_df['Weight']
X

Unnamed: 0,Length1,Length2,Length3,Height,Width,Species_Label
0,35.0,38.5,44.1,18.0369,6.3063,1
1,10.0,10.5,11.6,1.9720,1.1600,4
2,23.9,26.5,31.1,12.3778,4.6961,1
3,24.0,26.0,29.2,8.8768,4.4968,2
4,18.4,20.0,22.4,8.8928,3.2928,5
...,...,...,...,...,...,...
114,19.0,20.7,23.2,8.5376,3.2944,5
115,31.0,33.5,38.7,14.4738,5.7276,1
116,20.0,22.0,23.5,6.1100,3.4075,0
117,27.8,30.0,31.6,7.6156,4.7716,0


In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=123)

In [32]:
model = LinearRegression()

model.fit(X_train, y_train)

In [33]:
pred = model.predict(X_test)
pred[:10]

array([197.28096409, 574.27565671, 628.81841786,  58.76751739,
       303.38016   , 759.55200694, 454.19406255, 897.42807783,
       806.10432275, 234.9743096 ])

In [34]:
y_test.head()

Unnamed: 0,Weight
4,150.0
91,700.0
105,685.0
103,78.0
31,218.0


In [35]:
mean_squared_error(y_test, pred) ** 0.5 #RMSE

107.1580640918243

In [36]:
r2_score(y_test, pred)

0.9101485202462486

# Try predict the model in real test dataset

In [37]:
real_test = pd.read_csv("/data/TestDataFlight.csv")
real_test.head()

Unnamed: 0,Species,Length1,Length2,Length3,Height,Width
0,Hirundichthys,9.3,9.8,10.8,1.7388,1.0476
1,Fodiater,34.8,37.3,39.8,6.2884,4.0198
2,Parexocoetus,19.0,20.7,23.2,9.396,3.4104
3,Cheilopogon,34.6,37.0,39.3,10.5717,6.3666
4,Exocoetus,21.1,22.5,25.0,6.4,3.8


In [38]:
real_test.shape

(40, 6)

In [43]:
converted_real_test = real_test.drop('Species', axis=1)
converted_real_test['Species_Label'] = encoder.fit_transform(real_test['Species'])
converted_real_test.head()

Unnamed: 0,Length1,Length2,Length3,Height,Width,Weight,Species_Label
0,9.3,9.8,10.8,1.7388,1.0476,0.0,4
1,34.8,37.3,39.8,6.2884,4.0198,570.26,3
2,19.0,20.7,23.2,9.396,3.4104,225.54,5
3,34.6,37.0,39.3,10.5717,6.3666,669.09,0
4,21.1,22.5,25.0,6.4,3.8,185.07,2


In [40]:
import numpy as np

real_test_predict = model.predict(converted_real_test)
final_predict = np.round(real_test_predict, 2)
final_predict = np.where(final_predict < 0, 0, final_predict)
final_predict

array([  0.  , 570.26, 225.54, 669.09, 185.07, 651.01,   0.  , 586.84,
       673.11,   0.  , 434.23, 658.84, 661.44,  78.69, 337.43,   0.  ,
       707.7 , 246.26,   0.  , 154.19, 619.7 , 787.46, 207.59, 310.68,
       375.97, 509.  , 375.24, 609.63, 342.27, 754.66,   0.  ,  13.53,
       827.39,   0.  , 121.24, 158.62, 667.73, 574.65,  73.79, 360.95])

In [44]:
real_test['Weight'] = final_predict
print(real_test.head())
real_test.to_csv('/data/first_subm/TestDataFlight_with_pred.csv', index= False)

         Species  Length1  Length2  Length3   Height   Width  Weight
0  Hirundichthys      9.3      9.8     10.8   1.7388  1.0476    0.00
1       Fodiater     34.8     37.3     39.8   6.2884  4.0198  570.26
2   Parexocoetus     19.0     20.7     23.2   9.3960  3.4104  225.54
3    Cheilopogon     34.6     37.0     39.3  10.5717  6.3666  669.09
4      Exocoetus     21.1     22.5     25.0   6.4000  3.8000  185.07
