# Avocado Data Analysis Notebook
ALT-TAB LABS LLP &copy; 2020 All Rights Reserved

In [1]:
# show files in current directory
import os
os.listdir()

['75%.py',
 '80% - Copy.ipynb',
 '80%-One Hot Encoding & Imputing.ipynb',
 '83%-Datetime Conversion.ipynb',
 'avocado-submission.csv',
 'avocado-test.csv',
 'avocado-train.csv',
 'avocado_notebook_other_models.ipynb',
 'avocado_notebook_ragul.ipynb',
 'pipeline test.ipynb',
 'pyctfsglib.py',
 '__pycache__']

In [19]:
# load csvs
import pandas as pd
df = pd.read_csv('avocado-train.csv', index_col='id')
df.dropna()
df.head()

Unnamed: 0_level_0,DataBatch,Date,TotalVolume,4046,4225,4770,TotalBags,SmallBags,LargeBags,XLargeBags,type,region,AveragePrice
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,7,2016-11-06,183542.31,98949.98,22891.61,95.0,61605.72,43571.99,17499.01,534.72,conventional,NewOrleansMobile,1.49
1,24,2017-07-16,224434.92,42951.31,120360.02,131.85,60991.74,53141.81,3621.04,4228.89,conventional,HarrisburgScranton,1.38
2,51,2015-01-04,3846.69,1500.15,938.35,0.0,1408.19,1071.35,336.84,0.0,organic,Atlanta,1.76
3,22,2015-07-26,91825.07,1679.28,45615.48,741.77,43788.54,43788.54,0.0,0.0,conventional,BuffaloRochester,1.39
4,50,2015-01-11,54644.32,1491.88,33759.12,1325.17,18068.15,12165.94,5902.21,0.0,conventional,Pittsburgh,1.54


In [21]:
### Dealing with Missing Values ########################################################
# https://www.kaggle.com/alexisbcook/missing-values
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

def dropColumns(X_train, X_valid):
    # Get names of columns with missing values
    cols_with_missing = [col for col in X_train.columns
                        if X_train[col].isnull().any()]

    # Drop columns in training and validation data
    reduced_X_train = X_train.drop(cols_with_missing, axis=1)
    reduced_X_valid = X_valid.drop(cols_with_missing, axis=1)
    return reduced_X_train, reduced_X_valid


def imputing(X_train, X_valid):
    # Imputation
    from sklearn.impute import SimpleImputer
    my_imputer = SimpleImputer()
    imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train))
    imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid))

    # Imputation removed column names; put them back
    imputed_X_train.columns = X_train.columns
    imputed_X_valid.columns = X_valid.columns
    return imputed_X_train, imputed_X_valid

def imputePlus(X_train, X_valid):
    X_train_plus = X_train.copy()
    X_valid_plus = X_valid.copy()

    # Make new columns indicating what will be imputed
    # Get names of columns with missing values
    cols_with_missing = [col for col in X_train.columns
                        if X_train[col].isnull().any()]
    for col in cols_with_missing:
        X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
        X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

    # Imputation
    from sklearn.impute import SimpleImputer
    my_imputer = SimpleImputer()
    imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
    imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))

    # Imputation removed column names; put them back
    imputed_X_train_plus.columns = X_train_plus.columns
    imputed_X_valid_plus.columns = X_valid_plus.columns
    return imputed_X_train_plus, imputed_X_valid_plus

### Dealing with Categorial Variables ##################################################
def getCategories(X_train):
    # Get list of categorical variables
    s = (X_train.dtypes == 'object')
    object_cols = list(s[s].index)
    return object_cols

def dropCategories(X_train, X_valid,object_cols = getCategories(X_train)):
    object_cols = getCategories(X_train)
    drop_X_train = X_train.select_dtypes(exclude=['object'])
    drop_X_valid = X_valid.select_dtypes(exclude=['object'])
    return drop_X_train, drop_X_valid

# Label Encoding
def labelEncoding(X_train, X_valid,object_cols = getCategories(X_train)):
    from sklearn.preprocessing import LabelEncoder
    # Make copy to avoid changing original data 
    label_X_train = X_train.copy()
    label_X_valid = X_valid.copy()

    # Apply label encoder to each column with categorical data
    label_encoder = LabelEncoder()
    for col in object_cols:
        label_X_train[col] = label_encoder.fit_transform(X_train[col])
        label_X_valid[col] = label_encoder.transform(X_valid[col])

    return label_X_train, label_X_valid

### One Hot Encoding
def oneHotEncoding(X_train, X_valid,object_cols = getCategories(X_train)):
    from sklearn.preprocessing import OneHotEncoder
    # Apply one-hot encoder to each column with categorical data
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
    OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

    # One-hot encoding removed index; put it back
    OH_cols_train.index = X_train.index
    OH_cols_valid.index = X_valid.index

    # Remove categorical columns (will replace with one-hot encoding)
    num_X_train = X_train.drop(object_cols, axis=1)
    num_X_valid = X_valid.drop(object_cols, axis=1)

    # Add one-hot encoded columns to numerical features
    OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
    OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

    return OH_X_train, OH_X_valid

In [47]:
# Convert strings to numbers
df["Date"] = pd.to_datetime(df["Date"])
# Clean Data
#import datetime as dt
#df.Date = [dt.date.fromisoformat(x).toordinal() for x in df.Date]

c = ["DataBatch",'TotalVolume',"4046","4225","4770","TotalBags","SmallBags","LargeBags","XLargeBags","AveragePrice"]
'''
for column in c:
    df[column] = pd.to_numeric(df[column])
'''
# Select data for learning
features = c[:-1]+["type","region"]
X = df[features]
Y = df["AveragePrice"]

# Split training into some for training and some for testing
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

from sklearn.model_selection import train_test_split
#TODO TODO TODO TODO TODO TODO TODO TODO
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)#, random_state=42)
print("training split: ", len(X_train), "; test split: ", len(X_test))

training split:  8941 ; test split:  3833


In [48]:
# View the data!
#df.head()
X_train, X_test = labelEncoding(X_train, X_test)#, ['type'])
X_train, X_test = imputePlus(X_train, X_test)

In [49]:
X_train.head()
#y.head()

Unnamed: 0,DataBatch,TotalVolume,4046,4225,4770,TotalBags,SmallBags,LargeBags,XLargeBags,type,region
0,2.0,185141.54,3659.19,50387.09,836.2,130259.06,119351.93,10907.13,0.0,1.0,25.0
1,22.0,1509.8,781.44,199.22,0.0,529.14,516.67,12.47,0.0,1.0,14.0
2,44.0,2373.09,1437.69,715.4,0.0,220.0,220.0,0.0,0.0,1.0,14.0
3,44.0,121048.7,79840.95,10684.39,1447.09,29076.27,21773.29,7302.98,0.0,0.0,20.0
4,46.0,246616.27,64858.54,116792.77,65.38,64899.58,17633.33,47266.25,0.0,1.0,52.0


# Regression Time!

In [25]:
import tensorflow as tf
import logging
logger = tf.get_logger()
logger.setLevel(logging.ERROR)

import numpy as np

In [26]:
# Units refers to the number of neurons
l0 = tf.keras.layers.Dense(units=len(X_train.columns), input_shape=[len(X_train.columns)])
l1 = tf.keras.layers.Dense(units=len(X_train.columns), input_shape=[len(X_train.columns)])
l2 = tf.keras.layers.Dense(units=len(X_train.columns), input_shape=[len(X_train.columns)])
le = tf.keras.layers.Dense(units=1)
model = tf.keras.Sequential([l0, l1, l2, le])
model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(0.001))
model.fit(X_train, Y_train, epochs=1, verbose=True)#,batch_size=1, callbacks=[earlystop_callback])

#print(model.predict(np.array([[100,101], [9,8]])))
print("Finished training the model")
#print("These are the l0 variables: {}".format(l0.get_weights()))
#print("These are the l1 variables: {}".format(l1.get_weights()))

Train on 8941 samples
Finished training the model


In [27]:
tfmodel = model
results = tfmodel.evaluate(X_test,Y_test,batch_size=128, steps=100)
results



2985575168.0

In [50]:
# Sci Kit Learn! Scientific Computing library for python
from sklearn import *
# Pick the regression model we want to use

import sklearn.tree as tree
models = [
    #tree.DecisionTreeRegressor(random_state=2020),
    ensemble.RandomForestRegressor(random_state=None, n_estimators=1000)
]

currScore = 0
model = None
for i in models:
    print(i)
    i.fit(X_train,Y_train)
    score = i.score(X_test,Y_test)
    if score > currScore:
        model = i
        currScore = score

# https://www.kdnuggets.com/2016/10/beginners-guide-neural-networks-python-scikit-learn.html/2


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=1000, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)


In [51]:
# Score model
model.score(X_test, Y_test) #TODO TODO TODO TODO TODO TODO TODO TODO

0.8176263615799223

# Predict for Testing data

In [52]:
# Clean Data
MyDataFrame = pd.read_csv('avocado-test.csv', index_col='id')
# Drops missing values 
#MyDataFrame.dropna()
# Convert strings to numbers

MyDataFrame["Date"] = pd.to_datetime(MyDataFrame["Date"])
#MyDataFrame.Date = [dt.date.fromisoformat(x).toordinal() for x in df.Date]
c = ['TotalVolume',"4046","4225","4770","TotalBags","SmallBags","LargeBags","XLargeBags"]#,"AveragePrice"]
#for column in c:
#    MyDataFrame[column] = pd.to_numeric(MyDataFrame[column])

In [53]:
# Select features
XforPredictions =  MyDataFrame[features]#None #TODO TODO TODO TODO TODO TODO TODO TODO


X_train, X_test = labelEncoding(X, XforPredictions)
X_train, X_test = imputePlus(X_train ,X_test)

#XforPredictions.head()
X_test.head()

Unnamed: 0,DataBatch,TotalVolume,4046,4225,4770,TotalBags,SmallBags,LargeBags,XLargeBags,type,region
0,21.0,28969.34,80.77,27361.91,0.0,1526.66,1526.66,0.0,0.0,1.0,8.0
1,19.0,322962.89,5632.85,242365.82,2441.56,72522.66,22942.48,46083.51,3496.67,0.0,30.0
2,49.0,155334.45,3657.79,74068.65,0.0,77608.01,27302.29,50305.72,0.0,1.0,15.0
3,12.0,10231.74,341.89,8519.0,0.0,1370.85,303.33,1067.52,0.0,1.0,9.0
4,10.0,10652.02,6905.95,39.52,0.0,3706.55,3706.55,0.0,0.0,1.0,18.0


In [54]:
# Make Predictions!
#model.fit(X,Y)
if False:
    yPredictions = tfmodel.predict(X_test)
    #If Using tfmodel
    predictions = []
    for i in yPredictions:
        predictions.append(i[0])
    yPredictions = predictions
else:
    yPredictions = model.predict(X_test)
    print("Predictions:", yPredictions)

Predictions: [1.97224 1.46453 1.28572 ... 1.4239  1.97609 1.65349]


In [55]:
# Save to CSV File!
XforPredictions =  MyDataFrame[features]
output = pd.DataFrame({'id': XforPredictions.index, 'AveragePrice': yPredictions})
output.to_csv('avocado-submission.csv', index=False)
print(output)

id  AveragePrice
0        0       1.97224
1        1       1.46453
2        2       1.28572
3        3       1.58024
4        4       1.48246
...    ...           ...
5470  5470       1.70463
5471  5471       1.37585
5472  5472       1.42390
5473  5473       1.97609
5474  5474       1.65349

[5475 rows x 2 columns]


# Upload for grading

In [56]:
# Download CTFSG Grader Libraries
import urllib.request, os
urllib.request.urlretrieve('https://raw.githubusercontent.com/alttablabs/ctfsg-utils/master/pyctfsglib.py', './pyctfsglib.py')
print('Downloaded pyctfsglib.py:', 'pyctfsglib.py' in os.listdir())

Downloaded pyctfsglib.py: True


In [57]:
# Connect to graders
import pyctfsglib as ctfsg
import random

USER_TOKEN = "MXhtGfdjdsUfiEKTHHEuVGohZESBdMiHrFkmYqNqIFfcWOHGvcubvHJvnxpAqRMh" # You need to fill this up
GRADER_URL = random.choice([
"http://challenges.csdc20t.ctf.sg:30001/", "http://challenges.csdc20t.ctf.sg:30002/"
])
grader = ctfsg.DSGraderClient(GRADER_URL, USER_TOKEN)

DSGraderClient: Successfully Connected!
[SERVER] MOTD: CHECK your USER_TOKEN and GRADER_URL HTTP address! I'm AVOCADO_PRICE GRADER_5021d39ae191


In [58]:
grader.submitFile('avocado-submission.csv')

ProofOfWork Challenge =>  ('CTFSGRB05ce4aec1cc8e0394bee379b916c7e67', 22)
ProofOfWork Answer Found! =>  2022964


'{"challenge":{"name":"Avocado Prices"},"id":"ck8n7jqia767v0702ggqodww9","status":"PARTIALLY_CORRECT","multiplier":0.8279,"submittedBy":{"username":"nyjc-1"},"createdAt":"2020-04-05T15:35:40Z"}'