In [None]:
import scipy.stats
import scipy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from read_data import get_training, get_test, get_Doc2Vec, get_data
from processing import combine_with_vec, exclude_non_numeric

from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline
# Apply seaborn's theme to all following plots, overriding the figure
# background colour so that figures are drawn on white.
sns.set(rc={"figure.facecolor": "white"})

In [None]:
# Training features (X) and target labels (y).
X, y = get_training()

# Doc2Vec representations of the training split at two embedding sizes.
# Each call yields three objects, unpacked here as name / ingredients / steps
# (roles taken from the variable names — confirm against read_data).
doc2vec_train_50 = get_Doc2Vec(data="train", num_features=50)
train_name_vec50, train_ingr_vec50, train_steps_vec50 = doc2vec_train_50

doc2vec_train_100 = get_Doc2Vec(data="train", num_features=100)
train_name_vec100, train_ingr_vec100, train_steps_vec100 = doc2vec_train_100

In [None]:
# Combine the 50-feature Doc2Vec vectors with the numeric columns of the
# training data (exclude_non_numeric presumably drops the non-numeric
# columns — verify against processing.py).
# Every input is copied first so that the frames loaded above are not affected
# if the helpers modify their arguments in place (not visible here — TODO confirm).
temp_X = X.copy()
temp_X = exclude_non_numeric(temp_X)
temp_train_name_vec50 = train_name_vec50.copy()
temp_train_ingr_vec50 = train_ingr_vec50.copy()
temp_train_steps_vec50 = train_steps_vec50.copy()
# Combined feature table used by the correlation analysis further down.
X_combined_50: pd.DataFrame = combine_with_vec(temp_X, temp_train_name_vec50, temp_train_ingr_vec50, temp_train_steps_vec50)

## Target

In [None]:
# Class balance of the target.
# The labels are passed explicitly as `x`: a bare positional argument is
# deprecated and newer seaborn releases interpret it as `data` instead.
ax = sns.countplot(x=y, label="Count")

# value_counts() orders its result by frequency (most common first), not by
# label, so sort by label before unpacking; otherwise a count can be printed
# next to the wrong duration.
# Assumes exactly the three labels 1.0, 2.0 and 3.0 occur, as the prints do.
d1, d2, d3 = y.value_counts().sort_index()
print("Duration 1.0", d1)
print("Duration 2.0", d2)
print("Duration 3.0", d3)

In [None]:
# Put the target next to the numeric features, then reshape to long format:
# one row per (sample, feature) pair with the sample's duration_label kept
# as the identifier column.
data = pd.concat([y, exclude_non_numeric(X)], axis=1).melt(
    id_vars="duration_label",
    var_name="features",
    value_name="value",
)

# One box per feature, split by duration label.
plt.figure(figsize=(10, 10))
sns.boxplot(data=data, x="features", y="value", hue="duration_label")

In [None]:
# Correlation of every combined feature with the target, ranked by absolute
# value; show the 20 strongest.
target_correlations = X_combined_50.corrwith(y)
target_correlations.sort_values(key=np.abs, ascending=False).head(20)

# Feature analysis (Pre-Processing)
## n_steps

In [None]:
# Distribution of the raw n_steps values with a fitted normal curve overlaid.
# NOTE(review): distplot is deprecated in recent seaborn releases — consider
# histplot/displot when upgrading.
n_steps_ax = sns.distplot(X["n_steps"], fit=scipy.stats.norm)
n_steps_ax.set_ylabel("Frequency")
n_steps_ax.set_title("Frequency histogram of n_steps")

In [None]:
# Probability plot of raw n_steps against the normal distribution
# (probplot's default reference distribution, spelled out here).
res = scipy.stats.probplot(X["n_steps"], dist="norm", plot=plt)
plt.title("Probability Plot of n_steps")

In [None]:
# Same histogram after a log(1 + x) transform, to compare against the raw
# distribution plotted earlier.
log_n_steps = np.log1p(X["n_steps"])
log_n_steps_ax = sns.distplot(log_n_steps, fit=scipy.stats.norm)
log_n_steps_ax.set_ylabel("Frequency")
log_n_steps_ax.set_title("Frequency histogram of log(n_steps + 1)")

In [None]:
# Probability plot of log(1 + n_steps) against the normal distribution.
log_n_steps_values = np.log1p(X["n_steps"])
res = scipy.stats.probplot(log_n_steps_values, dist="norm", plot=plt)
plt.title("Probability Plot of log(n_steps + 1)")

## n_ingredients

In [None]:
# Distribution of the raw n_ingredients values with a fitted normal curve overlaid.
n_ingredients_ax = sns.distplot(X["n_ingredients"], fit=scipy.stats.norm)
n_ingredients_ax.set_ylabel("Frequency")
n_ingredients_ax.set_title("Frequency histogram of n_ingredients")

In [None]:
# Probability plot of raw n_ingredients against the normal distribution
# (probplot's default reference distribution, spelled out here).
res = scipy.stats.probplot(X["n_ingredients"], dist="norm", plot=plt)
plt.title("Probability Plot of n_ingredients")

In [None]:
# Same histogram after a log(1 + x) transform of n_ingredients.
log_n_ingredients = np.log1p(X["n_ingredients"])
log_n_ingredients_ax = sns.distplot(log_n_ingredients, fit=scipy.stats.norm)
log_n_ingredients_ax.set_ylabel("Frequency")
log_n_ingredients_ax.set_title("Frequency histogram of log(n_ingredients + 1)")

In [None]:
# Probability plot of log(1 + n_ingredients) against the normal distribution
# (probplot's default reference distribution).
res = scipy.stats.probplot(np.log1p(X["n_ingredients"]), plot=plt)
# Title uses the actual column name, matching the histogram title for the
# same transformed feature.
plt.title("Probability Plot of log(n_ingredients + 1)")

## Doc2Vec (50 features)

In [None]:
# Standardise each column of the 50-feature name vectors (zero mean, unit
# variance); the result is indexed positionally by the plots that follow.
name_vec50_scaler = StandardScaler()
scaled_train_name_vec50 = name_vec50_scaler.fit_transform(train_name_vec50)



In [None]:
# Distribution of the first (index 0) standardised Doc2Vec name feature with a
# fitted normal curve overlaid.
sns.distplot(scaled_train_name_vec50[:, 0], fit=scipy.stats.norm)
plt.ylabel("Frequency")
# The title names the data actually plotted; the previous text was copied from
# the n_ingredients cell and described the wrong feature.
plt.title("Frequency histogram of Doc2Vec feature 0")

In [None]:
# Probability plot of the first standardised Doc2Vec name feature against the
# normal distribution (probplot's default reference distribution).
doc2vec_feature0 = scaled_train_name_vec50[:, 0]
res = scipy.stats.probplot(doc2vec_feature0, dist="norm", plot=plt)
plt.title("Probability Plot of Doc2Vec feature 0")

In [None]:
# Load the training CSV as a single frame; the cells below read its
# duration_label and n_steps columns.
train_csv_path = "./COMP30027_2021_Project2_datasets/recipe_train.csv"
df = get_data(train_csv_path)

In [None]:
# n_steps for recipes with duration label 3.0: distribution plot and mean.
n_steps_label3 = df.loc[df["duration_label"] == 3.0, "n_steps"]
sns.distplot(n_steps_label3)
print(n_steps_label3.mean())

In [None]:
# n_steps for recipes with duration label 2.0: distribution plot and mean.
n_steps_label2 = df.loc[df["duration_label"] == 2.0, "n_steps"]
sns.distplot(n_steps_label2)
print(n_steps_label2.mean())

In [None]:
# n_steps for recipes with duration label 1.0: distribution plot and mean.
n_steps_label1 = df.loc[df["duration_label"] == 1.0, "n_steps"]
sns.distplot(n_steps_label1)
print(n_steps_label1.mean())