# Importing libraries 

In [None]:
import pandas as pd # tabular data handling, reading and writing CSV/TSV files

import numpy as np # linear algebra and array manipulation


############  importing the data visualization libraries :##############

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm # Creating a normaly distributed curve

##################### Importing the Regression Models ####################
from sklearn. ensemble import RandomForestRegressor # Random forest regressor model
from sklearn.linear_model import Ridge
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso

# Importing a model to split the training set from the evaluation set
from sklearn. model_selection import train_test_split 

############# R2 as the unit of evaluation measure #################
from sklearn. metrics import r2_score                

############  RandomizedSearchCV #################
from sklearn.model_selection import RandomizedSearchCV

################## Textual data preprocessing ##################
from sklearn. feature_extraction.text import TfidfVectorizer 
import warnings
warnings.filterwarnings("ignore")
from os import path
from PIL import Image
from wordcloud import WordCloud
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords as sw
import re
import string

############################## Import Utility Functions ################################

from packages.utils import *  # import utils from packages customized module contain all neccessary function for the project 

%matplotlib inline
# Plotting style. Matplotlib 3.6 renamed the bundled seaborn styles to
# 'seaborn-v0_8-*' and later dropped the old names, so use whichever one this
# installation provides.
_notebook_style = 'seaborn-notebook'
if _notebook_style not in plt.style.available:
    _notebook_style = 'seaborn-v0_8-notebook'
plt.style.use(_notebook_style)

In [None]:
# Load the two tab-separated splits: the development set and the evaluation set.
df_dev = pd.read_csv('./development.tsv',sep="\t")
df_eval = pd.read_csv('./evaluation.tsv',sep="\t")

## Data Exploration and Feature engineering :

In [None]:
# Report the number of rows in each split, then preview the first five
# development rows.
print(f"the lenght of the development_set = {len(df_dev)} " )
print(f"the lenght of the evaluation_set = {len(df_eval)} " )
df_dev.head(5)

In [None]:
list(df_dev.columns)

In [None]:
# Stack the development and evaluation sets so that cleaning and preprocessing
# are applied to both in one pass.
# NOTE(review): ignore_index is not set, so each frame keeps its own row labels
# and labels may repeat in `df` — confirm that downstream helpers expect this.
df = pd.concat([df_dev, df_eval], sort=False )
len(df_dev), len(df_eval), len(df)

In [None]:
# Give every column a short, single-word name that is easier to type.
rename = {
    "beer/ABV": "abv",
    "beer/name": "beerName",
    "beer/style": "beerstyle",
    "review/appearance": "apperance",
    "review/aroma": "aroma",
    'review/overall': 'overall',
    'review/palate': 'palate',
    'review/taste': "taste",
    'review/text': 'text',
    'user/gender': "gender",
    'user/profileName': "profilename",
    'user/ageInSeconds': 'age',
}
df = df.rename(columns=rename)

In [None]:
# Percentage of missing values per feature (the target column is excluded).
feature_frame = df.drop(["overall"], axis=1)
missing_per_column = feature_frame.isnull().sum(axis=0)
print(missing_per_column * 100 / len(df), "\n")

In [None]:
# Independent deep copy of the renamed frame, taken before any V1 cleaning,
# to serve as the starting point of version 2 of the pipeline.
df_v2 = df.copy(deep=True)

## Target Variable Exploration :

In [None]:
# Visualise the distribution of the target variable with seaborn.

# Rows coming from the evaluation set have no target, so drop the NaNs first
# and plot the cleaned series.
reviews = df["overall"].dropna()
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot /
# displot are the replacements if the environment is upgraded.
sns.distplot(reviews, norm_hist=False)
plt.grid(axis="y")
# show must be called; a bare `plt.show` only references the function.
plt.show()

In [None]:
df.nunique()

In [None]:
# Box plot of the non-missing target values (`reviews` is built in the
# distribution cell above).
# Finding: the target variable is skewed left and its median is 4.
sns.boxplot(x=reviews)

In [None]:
# Summary statistics of the target; the standard deviation uses the sample
# estimator (ddof=1).
mean = reviews.mean()
median = reviews.median()
std = reviews.std(ddof=1)

print(f"Reviews is not normaly distributed with mean = {mean:.2f} and standard deviation = {std:.2f} and median = {median:.2f}")

## V1- simple Imputation without text analysis

### "profileName"

In [None]:
# Number of distinct reviewer names, followed by a histogram of the column.
# NOTE(review): data_visualization_histogram comes from packages.utils; the
# meaning of its last argument (20) is not visible here — confirm.
print(f"the feature profilename has cradinality of {df.profilename.nunique()} \n ")
data_visualization_histogram("profilename" , df , 20)

In [None]:
# "user/profileName" have 
# Collapse rare reviewers: every profile name seen fewer than 50 times becomes
# the single level 'other values'. Missing names stay missing.
x = df["profilename"].copy()
# Look each name's frequency up with Series.map; Series.replace with a dict
# holding one entry per distinct name yields the same counts but is far slower.
occurrences = x.map(x.value_counts())
x[occurrences < 50] = 'other values'
x.nunique()

In [None]:
# Store the collapsed reviewer names back on the frame.
df["profilename"] = x

In [None]:
# Replace the missing profile names with the most frequent value.
# The result is assigned back explicitly: fillna(inplace=True) on a column
# selected with df[...] acts on an intermediate object and is not guaranteed to
# update df (it does nothing under pandas copy-on-write).
most_common_profile = df["profilename"].value_counts().index[0]
df["profilename"] = df["profilename"].fillna(most_common_profile)

In [None]:
df["profilename"].isnull().sum()

In [None]:
# Before handling beer name and beer style, check whether they are redundant:
# select the rows where both columns hold the same string.
mask = df["beerstyle"] == df["beerName"]
# NOTE(review): nunique() on a two-column selection returns a Series, so the
# message embeds a per-column listing rather than a single number.
print(f"there is {df.loc[mask ,['beerstyle','beerName'] ].nunique()} redundant unique values between the 2 features")
# Fraction of all rows in which the two columns are identical.
len(df.loc[mask ,["beerstyle" , "beerName"] ])/len(df["beerstyle"])

### 'beer/name'

In [None]:
# Number of distinct beer names, followed by a histogram of the column.
# The message now names the feature it reports (it said "profilename").
print(f"the feature beerName has cradinality of {df.beerName.nunique()} ")
data_visualization_histogram("beerName" , df , 20)

In [None]:
# "beerName" after discritizing it 
# Collapse rare beers: every name seen fewer than 130 times becomes 'others'.
x = df.beerName.copy()
# Series.map is a direct frequency lookup; Series.replace with a dict holding
# one entry per distinct name yields the same counts but is far slower.
occurrences = x.map(x.value_counts())
x[occurrences < 130] = 'others'
x.nunique()

In [None]:
# Store the collapsed beer names back on the frame.
df["beerName"] = x

In [None]:
# no nulls in this attribute
df["beerName"].isnull().sum()

### beer/Style

In [None]:
df.beerstyle.nunique()

In [None]:
# Number of distinct beer styles, followed by a histogram of the column.
# The message now names the feature it reports (it said "profilename").
print(f"the feature beerstyle has cradinality of {df.beerstyle.nunique()} ")
data_visualization_histogram("beerstyle" , df , 20)

In [None]:
# Collapse rare beer styles: every style seen fewer than 100 times becomes
# 'others'.
x = df["beerstyle"].copy()
# Series.map is a direct frequency lookup; Series.replace with a dict holding
# one entry per distinct style yields the same counts but is far slower.
occurrences = x.map(x.value_counts())
x[occurrences < 100] = 'others'
x.nunique()

In [None]:
# Store the collapsed beer styles back on the frame.
df["beerstyle"] = x

### 'beer/ABV'  alcohol by volume

In [None]:
# 'beer/ABV' (alcohol by volume): share of missing values.
abv_is_missing = df['abv'].isnull()
p = abv_is_missing.sum() / len(df['abv'])
print(f"the percentage of nulls = {p}" )

In [None]:
# Replace the missing ABV values with the column mean.
# NOTE(review): the mean is taken over development and evaluation rows
# together — confirm that this is intended.
mean_ABV = df['abv'].mean()
# Assign the result back: fillna(inplace=True) on a column selected with
# df[...] is a chained in-place call and is not guaranteed to update df.
df['abv'] = df['abv'].fillna(value=mean_ABV)
p =  df['abv'].isnull().sum()/len(df['abv'])
print(f"the percentage of nulls = {p}" )

### user/gender  

In [None]:
# Gender turned out to be highly unbalanced, with more than 60% missing values.
gender_column = df["gender"]
nulls = gender_column.isnull().sum() / len(gender_column)
print(f" gender containg {nulls*100:.2f} % null values " )

In [None]:
df_dev["user/gender"].value_counts()

In [None]:
list(rename.values())

#### Categorical Feature Transformation 

In [None]:
# One-hot encode the categorical features. The review text is excluded for now
# because it needs a different kind of transformation.
cat_col = ['beerName' , 'beerstyle' , 'profilename' ]
# get_dummies replaces the listed columns with indicator columns; drop_first=True
# additionally drops the first level of each feature, so k levels give k-1 columns.
df_one_h = pd.get_dummies(df,columns= cat_col , drop_first=True)
df_one_h.shape

In [None]:
df.columns

### V1-Training without Text Analysis & gender 

In [None]:
# Columns left out of the V1 feature matrix.
Dropped_Columns = [
    'text',
    "age",
    'user/birthdayRaw',
    'user/birthdayUnix',
    'gender',
]

In [None]:
# Build the train / validation / test matrices from the one-hot frame.
# NOTE(review): train_val_test comes from packages.utils; presumably it drops
# Dropped_Columns and separates labelled rows from evaluation rows — confirm
# against its definition.
X_train, X_valid, y_train, y_valid ,X_train_valid ,y_train_valid,X_test,y_test,feature_names ,index  = train_val_test(Dropped_Columns ,df_one_h)

### training models baselines  

In [None]:
# Baseline models, each passed to make_regression with the training and
# validation splits.
# NOTE(review): make_regression comes from packages.utils; presumably it fits
# the model and reports a validation score — confirm.

# Lasso baseline (alpha and tolerance both set to 0.1, not the library defaults).
lasso_default = Lasso(tol=0.1 , alpha = 0.1)
make_regression(lasso_default ,X_train , y_train  ,X_valid, y_valid )

# Ridge baseline with library defaults.
ridge_default =  Ridge()
make_regression(ridge_default ,X_train , y_train ,X_valid, y_valid  )

# Random forest baseline; fixed seed, all CPU cores.
RF_default = RandomForestRegressor(random_state=42 , n_jobs=-1)
make_regression(RF_default ,X_train , y_train , X_valid, y_valid )

#### Prediction of Baseline Models on Test Data :

In [None]:
# Run each baseline on the train+validation data and the test matrix.
# NOTE(review): print_results comes from packages.utils; the string argument
# looks like a run/output label — confirm how it is used.
print_results(RF_default,X_train_valid, y_train_valid,X_test ,"RandomForest_Default" ,index)
print_results(lasso_default ,X_train_valid, y_train_valid,X_test,"lasso_default" , index)
print_results(ridge_default,X_train_valid, y_train_valid,X_test, "Ridge_default" , index)


### Randomized Grid Search :

##### Ridge HyperParameters 

In [None]:
# Regularisation strength: ten evenly spaced values in [0.01, 3], plus 2.0.
alpha = np.linspace(start=0.01, stop=3, num=10).tolist() + [2.0]
# Whether to fit an intercept.
fit_intercept = [True, False]
# Convergence tolerance.
tol = [0.001, 0.01, 0.005]
# Solver used by the estimator.
solver = ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']

# Search space handed to the randomized search for Ridge.
Ridge_random_grid = dict(
    fit_intercept=fit_intercept,
    alpha=alpha,
    solver=solver,
    tol=tol,
)

##### Random Forest Hyper Parameters

In [None]:
# Print the installed scikit-learn version; the accepted hyper-parameter
# spellings depend on it.
import sklearn
print(f'The scikit-learn version is {sklearn.__version__}.')

In [None]:
# Search space for the random-forest regressor.

# Number of trees in the forest: 200, 250, ..., 1200.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 1200, num = 21)]
# Function used to measure the quality of a split. Criterion names are
# case-sensitive, so 'MAE' is rejected and fails every fit that samples it.
# NOTE: scikit-learn >= 1.0 renames these to 'squared_error' and
# 'absolute_error' and later removes the short names.
criterion = ['mse', 'mae']
# Number of features to consider at every split.
max_features = ['auto', 'sqrt', 'log2']
# Minimum number of samples required to split a node.
min_samples_split = [20, 15, 10]
# Whether each tree is trained on a bootstrap sample.
bootstrap = [True, False]
# Search space handed to the randomized search.
RF_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'criterion': criterion,
               'min_samples_split': min_samples_split,
               'bootstrap': bootstrap }

##### lasso Hyper Parameters

In [None]:
# Regularisation strengths, from 1e-2 down to 1e-7.
alphas = np.array([1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7])
# Coefficient update order.
selection = ['random', 'cyclic']

# Search space handed to the randomized search for Lasso.
lasso_grid = dict(alpha=alphas, selection=selection)

### Randomize Grid Search Results :

In [None]:
# Hyper-parameter search for Lasso on the train+validation data.
# NOTE(review): RandomizeGridSearch comes from packages.utils; presumably n_iter
# and cv are forwarded to RandomizedSearchCV. lasso_grid holds only 12
# combinations, fewer than n_iter=100 — confirm the helper tolerates that.
lasso_d = Lasso()
Lasso_best = RandomizeGridSearch(lasso_d ,lasso_grid ,X_train_valid , y_train_valid, n_iter=100 ,cv=10)

In [None]:
# Hyper-parameter search for Ridge on the train+validation data.
# NOTE(review): the positional 100 and 10 presumably map to n_iter and cv, as
# in the keyword form used for Lasso — confirm against the helper's signature.
ridge = Ridge()
Ridge_best = RandomizeGridSearch ( ridge , Ridge_random_grid ,X_train_valid , y_train_valid,100 ,10)

In [None]:
# Hyper-parameter search for the random forest on the train+validation data.
# NOTE(review): the positional 150 and 3 presumably map to n_iter and cv —
# confirm against the helper's signature.
RF = RandomForestRegressor(random_state=42 )
RF_best = RandomizeGridSearch ( RF , RF_grid ,X_train_valid , y_train_valid , 150 ,3)

#### Prediction of Best Models on Test Data :

In [None]:
# Run the tuned V1 models on the train+validation data and the test matrix;
# each call is tagged with a "_V1" label.
print_results(Lasso_best,X_train_valid, y_train_valid,X_test ,"Lasso_best_V1" ,index)
print_results(Ridge_best ,X_train_valid, y_train_valid,X_test,"Ridge_best_V1" , index)
print_results(RF_best,X_train_valid, y_train_valid,X_test, "RF_best_V1" , index)


# V2-Text Analysis + Advanced Imputation + Gender Feature 

### A-Handling Missing Values with Advanced Imputation 

#### 1- user/gender + ABV 

In [None]:
# Share of missing gender values in the V2 frame.
gender_v2 = df_v2["gender"]
nulls = gender_v2.isnull().sum() / len(gender_v2)
print(f" gender containg {nulls*100:.2f} % null values " )

In [None]:
# Share of missing ABV values in the V2 frame.
abv_v2 = df_v2["abv"]
nulls = abv_v2.isnull().sum() / len(abv_v2)
print(f" abv containg {nulls*100:.2f} % null values ")

In [None]:
# The Gender feature is higly impalanced so we need a better imputation methood than the simple imputer with frequancy 
df_v2["gender"].value_counts()

#### KNN imputation 

In [None]:
## Work on the numerical features plus gender.
## 'overall' is carried along only to tell labelled rows from evaluation rows;
## it is dropped again before scaling and imputation, because the evaluation
## set has no target and both sets must receive the same transformation.
features = ['abv', 'apperance', 'aroma','palate', 'taste' , 'gender' , 'overall','age']
df_c = df_v2[features].copy(deep=True)
# Keep the row labels of df_c identical to those of df_v2.
df_c.set_index(df_v2.index , inplace = True)
print(df_c.columns)
# Number of positions where the two indexes agree (should equal len(df_v2)).
print(np.sum(df_c.index == df_v2.index))

In [None]:
# df_c_train holds only the rows that have a target value; it is used to fit
# the scaler.
train_mask = ~df_c["overall"].isna()
df_c_train = df_c.copy(deep=True).loc[train_mask]

In [None]:
def mapage(age):
    """Convert an age given in seconds to a whole number of days.

    The value is coerced with float() and floor-divided by the number of
    seconds in a day, so the result is a float (NaN stays NaN).
    """
    seconds_per_day = 24 * 3600
    return float(age) // seconds_per_day
# Age in whole days for every row.
df_c['agedays'] = df_c['age'].map(mapage)

In [None]:
# Same conversion for the rows used to fit the scaler.
df_c_train['agedays'] = df_c_train['age'].map(mapage)

In [None]:
# Encode gender as 0 (Male) / 1 (Female) before normalising the features.
gender_codes = {'Male': int(0), "Female": int(1)}
# Assign the result back: replace(inplace=True) on a column selected with
# frame[...] is a chained in-place call and is not guaranteed to update the frame.
df_c["gender"] = df_c["gender"].replace(gender_codes)
df_c_train["gender"] = df_c_train["gender"].replace(gender_codes)
print(df_c.gender.value_counts(dropna=False) )
print(df_c_train.gender.value_counts(dropna=False) )

In [None]:
# The target and the raw age in seconds are not used for scaling or imputation
# ('agedays' replaces 'age').
df_c.drop(columns=['overall','age'] , inplace=True)
df_c_train.drop(columns=['overall','age'], inplace=True)

In [None]:
# Normalise the features with a min-max scaler before the KNN imputer, because
# a distance-based method needs features on a comparable scale.
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# Fit on the labelled (training) rows only.
scaler.fit(df_c_train)
# Apply the fitted scaler to all rows, training and test alike.
df_c[df_c.columns] = scaler.transform(df_c[df_c.columns])

In [None]:
df_c.head(5)

In [None]:
### Impute the missing values with a KNN imputer.
## k is set to 50 because a neighbourhood may contain long runs (more than 20)
## of rows whose value is itself missing.
## The imputer returns floats, so the imputed gender has to be rounded
## afterwards to get back to a 0/1 value.
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=50)
# Despite its name, the result holds every column of df_c, not only gender.
imputed_gender = imputer.fit_transform(df_c)

In [None]:
# Wrap the imputed array in a frame that has df_c's columns and df_v2's row labels.
imputed_gender_df = pd.DataFrame(data=imputed_gender, columns=df_c.columns, index=df_v2.index)
# The KNN imputer averages the neighbours' values, so an imputed gender is a
# fraction between 0 and 1; round it to recover the binary 0/1 encoding.
imputed_gender_df["gender"] = imputed_gender_df["gender"].round()

In [None]:
# imputed_gender_df.head()

In [None]:
## Remove the original columns; their scaled and imputed versions are added
## back from imputed_gender_df ('age' returns as 'agedays').
df_v2.drop(columns=["gender", "abv" ,'apperance', 'aroma','palate', 'taste','age'] , inplace=True)

In [None]:
# Append the imputed columns side by side; both frames share the same row labels.
df_v2 = pd.concat([df_v2, imputed_gender_df], axis=1)

In [None]:
# Sanity check after the concatenation: the two shapes, then the share of row
# labels that still match (expected 1.0).
print(df_v2.shape)
print(imputed_gender_df.shape )
labels_match = imputed_gender_df.index == df_v2.index
sum(labels_match) / len(df_v2)

In [None]:
df_v2.loc[:, ["gender", "abv" ,'apperance', 'aroma','palate', 'taste'] ].describe()

#### 2- "profilename"

In [None]:
### Imputation strategy for the reviewer name: group the labelled rows by
### review score and take the most frequent reviewer within each score, which
### skews the column less than one global fill value would.
## Only the training (labelled) part is considered.
mask = df.overall.notna()
scored_names = df_v2.loc[mask, ["overall", "profilename"]]
names = [
    rows["profilename"].value_counts().index[0]
    for _, rows in scored_names.groupby("overall")
]
names

In [None]:
# Fill each missing reviewer name in the labelled rows with the most frequent
# reviewer among the rows that share the same overall score.
# The previous loop called fillna() on a filtered copy and threw the result
# away, so nothing was written; it also paired scores with names through a
# hard-coded np.linspace grid that need not match the scores in the data.
labelled_rows = df_v2["overall"].notna()
top_reviewer_by_score = (
    df_v2.loc[labelled_rows.to_numpy()]
    .groupby("overall")["profilename"]
    .agg(lambda s: s.value_counts().index[0] if s.notna().any() else np.nan)
)
# Per-row fill candidate, keyed by the score itself; rows without a score get NaN.
fill_candidates = df_v2["overall"].map(top_reviewer_by_score)
# Positional boolean mask, so repeated row labels cannot cause misalignment.
needs_fill = (df_v2["profilename"].isna() & fill_candidates.notna()).to_numpy()
df_v2.loc[needs_fill, "profilename"] = fill_candidates.to_numpy()[needs_fill]

In [None]:
# Names still missing (the test rows) are filled with the most frequent name.
# This is the V2 pipeline, so it works on df_v2; the previous code filled the
# V1 frame `df`. The result is assigned back because fillna(inplace=True) on a
# selected column is not guaranteed to update the frame.
most_common_profile = df_v2["profilename"].value_counts().index[0]
df_v2["profilename"] = df_v2["profilename"].fillna(most_common_profile)
np.sum(df_v2.profilename.isna())

In [None]:
df_v2.loc[~ mask,["profilename"]].value_counts().index[0][0]

### B-feature discretization

#### 1-"profilename"

In [None]:
print(f"the feature profilename has cradinality of {df_v2.profilename.nunique()} ")

In [None]:
### Any value with a frequency below the threshold is mapped to a single
### 'others_users' level, to keep the number of one-hot / dummy columns small.
# Start from the V2 column: the previous code copied df.profilename, the V1
# column that was already collapsed and filled, which discarded the V2 imputation.
# Names that are still missing are left as NaN here.
x = df_v2.profilename.copy()
# Series.map is a direct frequency lookup and much cheaper than a dict replace.
occurrences = x.map(x.value_counts())
x[occurrences < 50] = 'others_users'
x.nunique()

In [None]:
# Store the collapsed reviewer names on the V2 frame.
df_v2["profilename"] = x

#### 2- 'beer/name'

In [None]:
# Number of distinct beer names in the V2 frame. The message now names the
# feature it reports, and reads df_v2 rather than the V1 frame.
print(f"the feature beerName has cradinality of {df_v2.beerName.nunique()} ")

In [None]:
# "beerName" after discritizing it it should be encoded with only 8 bits using get_dummies because this feature has no high feature importance we don't want all this high cardinality 
# Collapse beer names seen fewer than 130 times into 'others'. Read the V2
# column so this section does not depend on the V1 cleaning having run first.
x = df_v2.beerName.copy()
# Series.map is a direct frequency lookup and much cheaper than a dict replace.
occurrences = x.map(x.value_counts())
x[occurrences < 130] = 'others'
x.nunique()


In [None]:
# Store the collapsed beer names on the V2 frame.
df_v2["beerName"] = x

#### 3- beer/Style

In [None]:
## this was one the most important features extracted by Random forest regressor so we don't wnat to discretize it and it's not high cardinality 
df_v2.beerstyle.nunique()

In [None]:
# Rows where beer style and beer name hold the same string.
# NOTE(review): this reads the V1 frame `df`, whose two columns were already
# collapsed to 'others' for rare values — confirm that df_v2 was not intended.
mask = df["beerstyle"] == df["beerName"]
print(f"there is {df.loc[mask ,['beerstyle','beerName'] ].nunique()} redundant unique values between the 2 features")
# Fraction of all rows in which the two columns are identical.
len(df.loc[mask ,["beerstyle" , "beerName"] ])/len(df["beerstyle"])

### C- Feature Transformation [Categorical to Numerical]

In [None]:
# One-hot encode the categorical features of the V2 frame. The "text" attribute
# is excluded for now because it needs a different kind of transformation.
cat_col = [ 'beerstyle' , 'profilename' , "beerName"  ]
# get_dummies replaces the listed columns with indicator columns; drop_first=True
# additionally drops the first level of each feature, so k levels give k-1 columns.
df_one_h = pd.get_dummies(df_v2,columns= cat_col , drop_first=True)
df_one_h.shape

### D- Textual Semantic Analysis

#### 1- Handling Missing Values

In [None]:
# Only a very small number of reviews have no text. Rather than removing those
# rows (too extreme), they are filled with the most frequent review text.
# Numerator and denominator now both come from df_v2.
p =  df_v2['text'].isnull().sum()/len(df_v2['text'])
print(f"the percentage of nulls = {p}" )
print(f"number of null values is {df_v2['text'].isnull().sum()}")

In [None]:
df_v2.columns

In [None]:
# Fill the missing reviews with the most frequent review text.
x = df_v2.text.value_counts().index[0]
# Assign the result back: fillna(inplace=True) on an attribute-selected column
# is a chained in-place call and is not guaranteed to update the frame.
df_v2["text"] = df_v2["text"].fillna(value=x)
# df_one_h was derived from df_v2 before this fill and is the frame the text
# steps below read from, so apply the same fill to it.
df_one_h["text"] = df_one_h["text"].fillna(value=x)

In [None]:
df_v2.isna().sum()

#### 2-Review Text length study 

In [None]:
# Study the relation between the length of a review and its overall score.
# A new column holds the number of characters of each review text.

df_one_h["text_length"] = df_one_h.text.str.len()

In [None]:
# Forward-fill lengths that are missing because the review text was missing.
# Series.ffill() replaces the deprecated fillna(method="ffill"), and the result
# is assigned back because an in-place call on a selected column is not
# guaranteed to update the frame.
df_one_h["text_length"] = df_one_h["text_length"].ffill()
df_one_h["text_length"].isna().sum()

In [None]:
plot_lenght_overall = df_one_h[["text_length" , "overall"]].copy(deep=True)
plot_lenght_overall.head(5)


In [None]:
# Put the lengths on a natural-log scale for plotting.
plot_lenght_overall['text_length'] = plot_lenght_overall['text_length'].map(np.log)

In [None]:
plot_lenght_overall.head(5)

In [None]:
# Distribution of the log text length for each overall score.
sns.boxenplot( x = 'overall',  y='text_length', data = plot_lenght_overall)
# Save before showing: once show() returns the current figure has been
# released, so a later savefig() writes an empty image.
# NOTE(review): the file name " Text)Length.png" looks like a typo; it is kept
# unchanged in case something else reads that path.
plt.savefig(" Text)Length.png", format="png")
plt.show()


In [None]:
## The text length has almost the same distribution for every review score, so
## it is probably not a useful feature; remove it again.
df_one_h = df_one_h.drop(columns=['text_length'])

#### 3- Text Cleaning 


In [None]:
##### The custom stop words were selected iteratively, by inspecting the
##### resulting word clouds.
from nltk.corpus import stopwords as sw
my_stopwords = [
    "tA", "this", "that",
    # "there" and "thi" were missing a separating comma and silently merged
    # into the single token "therethi".
    "there", "thi",
    "dtype", "man", "one", "frien", "beer", "text", "text", "Ba", "says",
    "object", "call", "12oz", "tap", "two", "Name",
    "single", "Thanks", "Got", "Length", "tap", "oz", "general", "tan", "baby",
    "basically", "RR", "Th", "nearly", "see", "close", "November", "review",
    "called", "held", "wh", "notes", "Sam", "BA", "sma", "san", "br", "name",
    "visited", "review", "bottle", "with", "been", "that", "pour", "poured",
    "pours", "this",
]

# NLTK's English stop words plus the custom ones.
stopwords = set(sw.words('english'))
stopwords.update(my_stopwords)
# The vectorizer lowercases its input before filtering, so also register the
# lowercase form of every custom word; otherwise entries such as "Thanks" or
# "BA" can never match.
stopwords.update(word.lower() for word in my_stopwords)

In [None]:
# WordNetLemmatizer instance handed to the text-cleaning helper.
lemmatizer = WordNetLemmatizer()

In [None]:
# Clean every review text and keep the result in a new column.
# NOTE(review): cleaining_Text comes from packages.utils; presumably it
# tokenizes and lemmatizes the text and returns a string — confirm, including
# how it treats a missing (NaN) text.
df_one_h["cleaned_Text"] = df_one_h.text.apply(lambda text :cleaining_Text(lemmatizer ,text) )

#### 4-Study the most frequent words for Positive, Neutral and Negative Reviews and update stop words

In [None]:
## Study the most frequent words for poistive , Nutural and Negative Reviews 
def mapping(x):
    """Bucket an overall score into a sentiment label.

    Scores of 2 or less are 'Negative', scores of 4 or more are 'Positive',
    and anything else (including NaN) is 'Nutural'.
    """
    if x <= 2:
        return 'Negative'
    return 'Positive' if x >= 4 else 'Nutural'
    


In [None]:
# Keep only the rows that have a score: a missing score fails both comparisons
# in mapping() and would be counted as 'Nutural', mixing unlabelled evaluation
# reviews into the neutral word cloud.
has_score = df_one_h["overall"].notna().to_numpy()
word_Cloud_df = df_one_h.loc[has_score, ["cleaned_Text" , "overall"]].copy(deep=True)
# Replace the numeric score with its sentiment label.
word_Cloud_df['overall'] =  word_Cloud_df.overall.apply(mapping)
word_Cloud_df.overall.value_counts()

In [None]:
# One space-joined corpus per sentiment label.
sentiment_label = word_Cloud_df["overall"]
Positive = " ".join(word_Cloud_df[sentiment_label == "Positive"].cleaned_Text)
Nutural = " ".join(word_Cloud_df[sentiment_label == "Nutural"].cleaned_Text)
Negative = " ".join(word_Cloud_df[sentiment_label == "Negative"].cleaned_Text)

In [None]:

show_wordcloud(Positive, 'Positive'  )

In [None]:
show_wordcloud (Nutural, 'Nutural' )

In [None]:
show_wordcloud (Negative, 'Negative' )

### 5-Tf-Idf Vectorizer and feature transformation

In [None]:
# Rare words are probably the ones that push the quality score up, so the
# inverse-document-frequency weighting is enabled (use_idf=True).
# norm=None switches normalisation off; the previous norm=False happened to
# behave the same on old releases but is rejected by scikit-learn's parameter
# validation on newer ones. stop_words is passed as a list for the same reason
# (a set is not an accepted type there).
vectorizer = TfidfVectorizer(
    stop_words=sorted(stopwords),
    use_idf=True,
    norm=None,
    smooth_idf=True,
    lowercase=True,
    ngram_range=(1, 2),
)


In [None]:
# Pass the cleaned review documents to the vectorizer; fit_transform learns the
# vocabulary and idf weights and returns the document-term matrix.
# (No row normalisation is applied, because the vectorizer's norm is disabled.)

docs = df_one_h["cleaned_Text"]
tfidf_vectorizer_vectors = vectorizer.fit_transform(docs) # Return Document-term matrix


In [None]:
# the shape of the matrix (100000, 62641) that means :
# 100000 rows (vectors in this case) each vector /row represnts a document
# 62641 columns represents the terms (words) , in case of (1,1)n grams

tfidf_vectorizer_vectors.shape


In [None]:
# # we decide to select the 100/250 /500 /1K / 10K most popular words to represent an input features ( applying feature reduction)

# # we defined a function so we can apply different values of N and evaluate the perfromance 

In [None]:
# Keep the N most popular terms as input features. Only N=750 is active; the
# other sizes that were tried are left commented out.
# NOTE(review): selecting_terms comes from packages.utils; how it ranks terms
# is not visible here — confirm.
# freq_10k = selecting_terms(10000 , tfidf_vectorizer_vectors, vectorizer=vectorizer)
# freq_5k = selecting_terms(5000 , tfidf_vectorizer_vectors, vectorizer=vectorizer)
# freq_1k = selecting_terms(1000,tfidf_vectorizer_vectors, vectorizer=vectorizer)
freq_750 = selecting_terms(750 , tfidf_vectorizer_vectors, vectorizer=vectorizer)
# freq_500 = selecting_terms(500 , tfidf_vectorizer_vectors, vectorizer=vectorizer)
# freq_250 = selecting_terms(250 , tfidf_vectorizer_vectors, vectorizer=vectorizer)
# freq_100 = selecting_terms(100 , tfidf_vectorizer_vectors, vectorizer=vectorizer)

In [None]:
# freq_500[0:60]

In [None]:
# Build a frame of tf-idf values for the selected terms.
# NOTE(review): this imports from packages.utils13 while every other helper
# comes from packages.utils — confirm that the module name is intended.
from packages.utils13 import word_dataframe
# freq_1k_df = word_dataframe(freq_1k, tfidf_vectorizer_vectors ,vectorizer , df_one_h)
freq_750_df = word_dataframe(freq_750, tfidf_vectorizer_vectors ,vectorizer , df_one_h)
# freq_500_df = word_dataframe(freq_500, tfidf_vectorizer_vectors ,vectorizer , df_one_h)
# freq_250_df = word_dataframe(freq_250, tfidf_vectorizer_vectors ,vectorizer , df_one_h)
# freq_100_df = word_dataframe(freq_100, tfidf_vectorizer_vectors ,vectorizer , df_one_h)


In [None]:
### First make sure the output is as expected: its shape, and how many row
### labels agree with df_one_h.
# Only the 750-term frame is built in this notebook; freq_250_df is never
# defined (its creation is commented out), so referencing it raised NameError.
print(freq_750_df.shape)
print(sum(df_one_h.index == freq_750_df.index))

#### creating Word Cloud For Tf-Idf 

In [None]:
# Transpose so that each row is a term and each column a document.
# Uses the 750-term frame, the only one built in this notebook; freq_250_df is
# never defined (its creation is commented out).
cop = freq_750_df.transpose(copy=True)
cop.shape

In [None]:
# Total weight of each term across all documents, largest first; show the top 20.
tf_df = cop.sum(axis="columns").sort_values(ascending=False)
tf_df.head(20)

In [None]:
# Word cloud sized by the summed tf-idf weight of each term (at most 100 words).
wc = WordCloud(background_color='white',stopwords=stopwords, max_words=100,max_font_size=40, scale=3,random_state=1 )
wc.generate_from_frequencies(tf_df)

# 12x12 figure without axes, with a title above the image.
fig = plt.figure(1, figsize=(12, 12))
plt.axis('off') 
fig.suptitle("TF-IDF WORDCLOUD", fontsize=20)
# NOTE(review): top=2.3 lies outside the usual 0-1 figure fraction — confirm
# that the resulting layout is the intended one.
fig.subplots_adjust(top=2.3)

plt.imshow(wc)
plt.show()

In [None]:
## Append the term columns to the one-hot frame, side by side. Only the
## 750-term variant is active.
# df_1K = pd.concat([df_one_h , freq_1k_df] , axis=1 )
df_750 = pd.concat([df_one_h , freq_750_df] , axis=1 )
# df_500 = pd.concat([df_one_h , freq_500_df] , axis=1 )
# df_250 = pd.concat([df_one_h , freq_250_df] , axis=1 )
# df_100 = pd.concat([df_one_h , freq_100_df] , axis=1 )

#### 3- Training with Textual Features

In [None]:
# Raw and cleaned text are excluded (the tf-idf columns replace them), together
# with the two birthday columns; gender and agedays stay in for V2.
Dropped_Columns = [ 'text', 'user/birthdayRaw','user/birthdayUnix' , "cleaned_Text"  ] 
# NOTE(review): train_val_test comes from packages.utils — confirm how it
# derives the splits and the returned `index`.
X_train, X_valid, y_train, y_valid ,X_train_valid ,y_train_valid,X_test,y_test,feature_names, index = train_val_test( Dropped_Columns ,df_750)


##### A-training models baselines  

In [None]:
# Baseline models on the V2 features, each passed to make_regression with the
# training and validation splits.

# Lasso baseline (alpha and tolerance both set to 0.001).
lasso_default = Lasso( tol = 0.001 , alpha = 0.001)
make_regression(lasso_default ,X_train , y_train  ,X_valid, y_valid )

# Ridge baseline with library defaults.
ridge_default =  Ridge()
make_regression(ridge_default ,X_train , y_train ,X_valid, y_valid  )

# Random forest baseline; fixed seed, all CPU cores.
RF_default = RandomForestRegressor(random_state=42 , n_jobs=-1)
make_regression(RF_default ,X_train , y_train , X_valid, y_valid )

#### Prediction of Baseline Models on Test Data :

In [None]:
# Results of the *baseline* V2 models. The labels said "BEST", which belongs to
# the tuned models reported further down; they now say "Default", matching the
# naming of the V1 baseline runs.
print_results(RF_default ,X_train_valid, y_train_valid,X_test, "RandomForest_Default_with_Text",index)
print_results(lasso_default ,X_train_valid, y_train_valid,X_test ,"LASSO_Default_with_Text", index)
print_results(ridge_default,X_train_valid, y_train_valid,X_test , "Ridge_Default_with_Text" , index)


### Randomize Grid Search Results :

In [None]:
# Hyper-parameter search for Lasso on the V2 train+validation data.
# NOTE(review): lasso_grid holds only 12 combinations, fewer than n_iter=100 —
# confirm that RandomizeGridSearch tolerates that.
lasso_d = Lasso()
Lasso_best = RandomizeGridSearch(lasso_d ,lasso_grid ,X_train_valid , y_train_valid, n_iter=100 ,cv=10)

In [None]:
# Hyper-parameter search for Ridge on the V2 train+validation data.
# NOTE(review): the positional 100 and 10 presumably map to n_iter and cv —
# confirm against the helper's signature.
ridge = Ridge()
Ridge_best = RandomizeGridSearch ( ridge , Ridge_random_grid ,X_train_valid , y_train_valid,100 ,10)

In [None]:
# Hyper-parameter search for the random forest on the V2 train+validation data.
# NOTE(review): the positional 100 and 3 presumably map to n_iter and cv —
# confirm against the helper's signature.
RF = RandomForestRegressor(random_state=42 )
RF_best = RandomizeGridSearch (RF , RF_grid ,X_train_valid , y_train_valid,100 ,3) 

##### compute the feature importance by the regressor 

In [None]:
# # feature_names = df_drop[train_valid_mask].drop(columns=["overall"]).columns # extract the features from the dev set 
# Pair every feature with its importance in the baseline forest and list the
# 100 most important ones, highest first.
importance_by_feature = zip(feature_names, RF_default.feature_importances_)
sorted(importance_by_feature, key=lambda pair: pair[1], reverse=True)[:100]

#### B-Predictions of Best Models on Test Data :

In [None]:
# Results of the tuned V2 models. Each label now matches the model it is
# passed with: the Ridge and Lasso labels were swapped before.
print_results(RF_best ,X_train_valid, y_train_valid,X_test, "RandomForest_BEST_with_Textg",index)
print_results(Lasso_best ,X_train_valid, y_train_valid,X_test ,"LASSO_BEST_with_Textg", index)
print_results(Ridge_best,X_train_valid, y_train_valid,X_test , "Ridge_BEST_with_Textg" , index)