In [9]:
import pandas as pd
import numpy as np

# regular expressions and tools for sentiment analysis
from textblob import TextBlob
import re 

import pickle # to save the model



In [10]:
def dummification(df):
    """One-hot encode every non-numeric column of ``df``.

    Columns whose dtype is not a numpy number are replaced by indicator
    columns named ``<column>_<value>``; numeric columns are passed through
    unchanged. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding numeric and/or non-numeric columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the non-numeric columns are dummy-encoded.
    """
    # Every column that is not numeric is treated as categorical.
    non_numeric_cols = list(df.select_dtypes(exclude=np.number).columns)

    # The column name itself is used as the prefix of its indicator columns.
    return pd.get_dummies(df, columns=non_numeric_cols, prefix=non_numeric_cols)

In [11]:
# Create a function to clean the tweets
def cleanTxt(text):
    """Normalise a raw tweet into lower-case, space-separated word tokens.

    Steps, in order: drop the retweet marker, lower-case, remove @mentions,
    '#' signs, hyperlinks, '&amp;' entities and line breaks, replace every
    remaining non-word character by a space, delete the characters
    'å', 'ä' and 'ā', and collapse runs of spaces into one.
    Leading/trailing spaces are not stripped.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. the NaN pandas stores for
        a missing cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text; '' for non-string input.
    """
    # Missing tweets arrive as float NaN / None; they have no text to clean.
    if not isinstance(text, str):
        return ''

    # The retweet marker is upper-case, so it must be removed BEFORE
    # lower-casing (afterwards the pattern could never match). The word
    # boundary keeps words that merely end in "RT" intact.
    text = re.sub(r'\bRT\s+', '', text)
    text = text.lower()

    # Removing @mentions. The range uses an ASCII hyphen (an en dash here
    # would only match the literal characters '0', '–' and '9'); underscores
    # are included because they are valid in handles.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#', '', text)             # Removing '#' hash tag (the word is kept)
    text = re.sub(r'https?://\S+', '', text)  # Removing hyperlink
    text = re.sub(r'&amp;', '', text)         # Remove HTML-escaped ampersand
    text = re.sub(r'\n', ' ', text)           # Replace line breaks with a space
    text = re.sub(r'[^\w]', ' ', text)        # Replace every non-word character with a space

    # These letters count as word characters, so they survive the step above
    # and are deleted explicitly.
    text = re.sub(r'[åäā]', '', text)

    text = re.sub(r' +', ' ', text)           # Collapse multiple spaces

    # NOTE: there is no dedicated emoticon handling.
    return text

In [15]:
def cleanData(df_crm, df_finance, df_sales, df_twitter):
    """Build the model's feature matrix from the four raw source tables.

    The function normalises the join key of every table, scores the tweets
    with TextBlob, averages the scores per id, joins everything
    (sales ⋈ crm ⋈ finance as inner joins, twitter as a left join), imputes
    missing values, derives two new features, dummy-encodes the non-numeric
    columns and finally returns only the selected feature columns.

    Parameters
    ----------
    df_crm : pandas.DataFrame
        Must contain 'ID_CRM' and 'Income_Level'.
    df_finance : pandas.DataFrame
        Must contain 'ID_FINANCE' and 'Special_Pay'.
    df_sales : pandas.DataFrame
        Must contain 'ID_SALES' and 'Program_Code'.
    df_twitter : pandas.DataFrame
        Must contain 'ID_SALES' (the key) and 'text'.

    The merged table must also provide 'From_Grade', 'To_Grade', 'FPP' and
    'Total_Discount_Pax'.

    Returns
    -------
    pandas.DataFrame
        One row per merged record with the columns 'polarity',
        'Delta_From_To_Grade', 'Total_Pax_rev' and
        'SPR_Product_Type_East Coast'.
    """
    # column id renaming to join the datasets; rename() returns new frames,
    # so the assignments below do not touch the caller's objects
    df_crm = df_crm.rename(columns={'ID_CRM': 'id'})
    df_finance = df_finance.rename(columns={'ID_FINANCE': 'id'})
    df_sales = df_sales.rename(columns={'ID_SALES': 'id'})
    # NOTE(review): the twitter key is also expected under 'ID_SALES' --
    # confirm this matches the header of the twitter file.
    df_twitter = df_twitter.rename(columns={'ID_SALES': 'id'})

    # df_crm unique_id extraction: the number of characters trimmed from the
    # raw id depends on the length of Income_Level
    df_crm['id'] = np.where(df_crm['Income_Level'].str.len() == 2,
                            df_crm['id'].str.slice(start=1, stop=-2),
                            df_crm['id'].str.slice(start=1, stop=-1))

    # df_finance unique_id extraction (depends on the length of Special_Pay)
    df_finance['id'] = np.where(df_finance['Special_Pay'].str.len() == 2,
                                df_finance['id'].str.slice(start=2),
                                df_finance['id'].str.slice(start=0))

    # df_sales unique_id extraction (depends on the length of Program_Code)
    df_sales['id'] = np.where(df_sales['Program_Code'].str.len() == 2,
                              df_sales['id'].str.slice(start=2, stop=-1),
                              df_sales['id'].str.slice(start=3, stop=-1))

    # df_twitter unique_id extraction: first run of digits in the raw id
    df_twitter['id'] = df_twitter['id'].str.extract(r'(\d+)', expand=False)

    df_twitter['text'] = df_twitter['text'].apply(cleanTxt)

    # get the sentiment of each tweet first, then calculate the mean per id.
    # Each tweet is scored once; element 0 is the polarity and element 1 the
    # subjectivity of the TextBlob sentiment.
    df_twitter_merged = df_twitter.copy()
    sentiments = [TextBlob(tweet).sentiment for tweet in df_twitter_merged['text']]
    df_twitter_merged['polarity'] = np.asarray([s[0] for s in sentiments], dtype=float)
    df_twitter_merged['subjectivity'] = np.asarray([s[1] for s in sentiments], dtype=float)

    # numeric_only=True keeps the historical behaviour (average the numeric
    # columns) and avoids the TypeError pandas >= 2.0 raises for 'text'.
    df_twitter_merged = df_twitter_merged.groupby('id').mean(numeric_only=True)

    # create a single table to join the datasets; customers without tweets
    # are kept (left join) and get NaN sentiment
    df_merged = (
        df_sales
        .merge(df_crm, on="id", how="inner")
        .merge(df_finance, on="id", how="inner")
        .merge(df_twitter_merged, on="id", how="left")
    )

    #______________ id
    # the key has served its purpose once the tables are joined
    df_merged = df_merged.drop(columns=['id'])

    #______________ From_Grade
    # missing values replaced with the median
    df_merged['From_Grade'] = df_merged['From_Grade'].fillna(df_merged['From_Grade'].median())

    #______________ To_Grade
    # missing values replaced with the median of the rows sharing the same
    # From_Grade. transform() always returns a result aligned with the
    # frame's index, whereas groupby-apply can prepend the group key to the
    # index and make the assignment fail.
    df_merged['To_Grade'] = (
        df_merged
        .groupby('From_Grade', sort=False)['To_Grade']
        .transform(lambda grades: grades.fillna(grades.median()))
    )
    # fallback for groups that have no To_Grade at all: overall median
    df_merged['To_Grade'] = df_merged['To_Grade'].fillna(df_merged['To_Grade'].median())

    #____**NEW**____ Delta_From_To_Grade
    # Distance from lowest to highest grade in school of participants. When
    # zero it represents an indicator for the trip taken by a group
    # comprising students from the same grade.
    df_merged['Delta_From_To_Grade'] = df_merged['To_Grade'] - df_merged['From_Grade']

    #_______________ Polarity
    # rows without tweets receive the mean polarity; plain assignment is
    # used because an in-place fill on a selected column may not write back
    # to the frame
    df_merged['polarity'] = df_merged['polarity'].fillna(df_merged['polarity'].mean())

    #____**NEW**____ Total_Pax_rev
    # NOTE(review): a sum of zero gives -inf and a negative sum gives NaN
    # here -- confirm such rows cannot occur in the input.
    df_merged['Total_Pax_rev'] = np.log(df_merged['FPP'] + df_merged['Total_Discount_Pax'])

    df_merged = dummification(df_merged)

    # Retained is the target
    Selected_Features = ['polarity',
                         'Delta_From_To_Grade',
                         'Total_Pax_rev',
                         'SPR_Product_Type_East Coast',
                         ]

    # The indicator column only exists when at least one row carries that
    # category; when it is absent, every row is by definition a 0.
    dummy_feature = 'SPR_Product_Type_East Coast'
    if dummy_feature not in df_merged.columns:
        df_merged[dummy_feature] = 0

    return df_merged[Selected_Features]

## Execute the Model

In [17]:
## Load the model
filename = 'DecisionTreeClassifierModel.sav'
# NOTE(security): unpickling executes arbitrary code embedded in the file;
# only load model files that come from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

## Load datasets

path = '20210222_generali_project_test/'
missing_values = ['']  # define values to be identified as NaN in the datasets
# Parsing options shared by all four files: only the empty string is read as
# NaN and ',' is the decimal separator.
read_options = dict(keep_default_na=False, na_values=missing_values, decimal=',')
df_crm = pd.read_csv(path + 'crm_test.csv', **read_options)          # import CRM_test
df_finance = pd.read_csv(path + 'finance_test.csv', **read_options)  # import finance_test
df_sales = pd.read_csv(path + 'sales_test.csv', **read_options)      # import sales_test
df_twitter = pd.read_csv(path + 'twitter_test.csv', **read_options)  # import twitter_test

# clean data: build the feature matrix expected by the model
X_test = cleanData(df_crm, df_finance, df_sales, df_twitter)

# execute the model
predictions = loaded_model.predict(X_test)

# save the result: one prediction per line, without index or header
pd.DataFrame(predictions).to_csv(path + "predictions.csv", index = False, header = False)

print('OK')

OK
