## How to Widen the Jupyter Notebook Display

In [None]:
# Inject a CSS rule that stretches the notebook's `.container` element to the
# full browser width. `display` and `HTML` come from the public
# `IPython.display` module; importing them from `IPython.core.display` is
# deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

## Extending the Number of Displayed Rows and Columns

In [None]:
pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 600)
pd.set_option('display.width', 1000)

## Renaming Columns Based on _ and Lowercase Rules

In [None]:
def cleanup_column_names(df, rename_dict=None, do_inplace=True):
    """Rename the columns of ``df``.

    When ``rename_dict`` is missing or empty, every column name is lower-cased
    and its spaces are replaced with underscores. Otherwise ``rename_dict`` is
    applied as the old-name -> new-name mapping.

    Args:
        df: DataFrame whose columns are renamed.
        rename_dict: Optional explicit mapping of old to new column names.
        do_inplace: Forwarded to ``DataFrame.rename(inplace=...)``.

    Returns:
        Whatever ``DataFrame.rename`` returns for the given ``do_inplace``
        (a renamed copy when ``do_inplace`` is false).
    """
    # ``None`` replaces the former ``{}`` default so no mutable object is
    # shared between calls.
    if not rename_dict:
        rename_dict = {col: col.lower().replace(' ', '_') for col in df.columns}
    return df.rename(columns=rename_dict, inplace=do_inplace)

## Concatenating the Title and Review Text Columns (Based on Either Non-Null Values)

In [None]:
# Keep the rows where at least one of title / review_text is present.
# .copy() gives df2 its own data, so the column assignment and the in-place
# drop below modify df2 itself instead of a slice of df.
df2 = df[df.title.notnull() | df.review_text.notnull()].copy()
# Join the two fields with a space, skipping any value whose string form is
# 'nan' (i.e. missing). The two former astype(str) calls discarded their
# result and had no effect, so they are removed.
df2['new_text'] = df2[['title', 'review_text']].apply(lambda x: ' '.join(str(y) for y in x if str(y) !='nan'), axis=1)
df2.drop('title', axis = 1, inplace = True)
df2.head()

## Missing Values Table

In [None]:
def missing_values_table(df):
    """Summarise the missing values of ``df`` per column.

    Prints how many columns ``df`` has and how many of them contain missing
    values, then returns a table indexed by column name with the columns
    'Missing Values' (count) and '% of Total Values' (share of rows, rounded
    to one decimal), sorted by percentage in descending order. Columns whose
    percentage is exactly 0 are left out.
    """
    # Count and percentage of nulls for every column
    null_counts = df.isnull().sum()
    null_percent = 100 * df.isnull().sum() / len(df)

    # Put both series side by side and give the columns readable names
    summary = pd.concat([null_counts, null_percent], axis=1)
    summary = summary.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only columns with a non-zero percentage, largest share first
    has_missing = summary.iloc[:, 1] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    # Report the overall numbers
    total_columns = str(df.shape[1])
    affected_columns = str(summary.shape[0])
    print("Your selected dataframe has " + total_columns + " columns.\n"
          "There are " + affected_columns +
          " columns that have missing values.")

    return summary

## Missing Value Plot

In [None]:
plt.figure(figsize = (10,7))
sns.set()
_ = sns.heatmap(df.isnull(),yticklabels=False, cbar = False, cmap = 'viridis')

## String Manipulation

In [None]:
# Remove trailing spaces from every pickup_city value. The vectorised
# .str accessor leaves missing values as they are, where the former
# per-element lambda raised AttributeError on them.
df['pickup_city'] = df['pickup_city'].str.rstrip(' ')

In [None]:
df = df[~df['estimated_ship_date'].str.contains('N')]

In [None]:
df[~df['estimated_ship_date'].str.startswith('20') | df['estimated_ship_date'].str.startswith('202')]

In [None]:
df2 = df[df.type.isin(['Car','SUV','Pickup','Motorcycle','Van'])]

In [None]:
# Keep the first run of five digits from each dropoff_zip value.
# The pattern is a raw string so that \d is not treated as an (invalid)
# string escape; expand=False returns a Series for the single capture group.
df['dropoff_zip'] = df['dropoff_zip'].str.extract(r'(\d{5})', expand=False)

In [None]:
df = df[pd.notnull(df['pickup_zip'])]

In [None]:
df.__len__()

In [None]:
# Number of rows whose pickup_city contains any digit 0-9.
# The former expression `'0' or '1' or ...` evaluates to just '0', so only
# zeros were searched for. na=False treats missing cities as "no digit" so
# the boolean mask contains no NaN.
len(df[df['pickup_city'].str.contains(r'[0-9]', na=False)])

In [None]:
zipped_list = list(zip(df['pickup_zip'], df['dropoff_zip']))

In [None]:
df1 = df1.apply(lambda x:x.astype(str).str.lower())

In [None]:
# Collapse every make starting at 'Old' (e.g. 'Olds', 'Oldsmobil') into
# 'Oldsmobile'. The result is assigned back explicitly: an in-place replace
# on the selected column may act on a temporary object and not reach df.
df['make'] = df['make'].replace(r'Old.*', 'Oldsmobile', regex=True)

In [None]:
vals_to_replace = {'chevolet':'chevrolet', 'chevypickup':'chevrolet', 'chev':'chevrolet',
                  'chev.':'chevrolet', 'chevy`':'chevrolet', 'chevevolet':'chevrolet','cheverelot':'chevrolet'}
df = df.replace({'make': vals_to_replace})

In [None]:
df.drop(df[df['estimated_ship_date'].str.contains('N')].index, axis=0, inplace=True)

In [None]:
df.year = pd.to_numeric(df.year, errors = 'coerce')

In [None]:
df2.sample(n=10)

In [None]:
df.index = pd.RangeIndex(len(df.index))

df.index = range(len(df.index))

In [None]:
df = df.groupby("make").filter(lambda x: len(x) >= 10)

In [None]:
# Repair the 'chevroletrolet' artefact inside make values. Assigned back
# explicitly because an in-place replace on the selected column may act on
# a temporary object and not reach df.
df['make'] = df['make'].replace({'chevroletrolet': 'chevrolet'}, regex=True)

In [None]:
df4 = df1.merge(df, on='install_at_site_use_id', how='inner')

In [None]:
df = df.rename(columns={'location': 'install_at_site_use_id'})

## Drop Duplicates

In [None]:
DG=df.groupby(['A', 'C'])   
pd.concat([DG.get_group(item) for item, value in DG.groups.items() if len(value)==1])

In [None]:
df.drop_duplicates(keep=False, inplace=True)

## First character is alpha or not

In [None]:
# Keep a host_id only if it is a string whose first character is alphabetic;
# everything else becomes NaN. x[:1] is '' for an empty string, so empty and
# non-string values (including NaN) no longer raise as x[0] did.
df2['host_id'] = df2['host_id'].apply(lambda x: x if isinstance(x, str) and x[:1].isalpha() else np.nan)

## Importing multiple csv files at once

In [None]:
import glob
path =r'C:\Users\mkadiogl\Desktop\Licensing\Cleaned_1M_databases' # use your path
allFiles = glob.glob(path + "/*.csv")

list_ = []

for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None, header=0)
    list_.append(df)

frame = pd.concat(list_, axis = 0, ignore_index = True)

# Importing Multiple Excel Files at Once

In [None]:
# Read install_at_site_1.xls ... install_at_site_74.xls ("Sheet 1" of each)
# and stack them into one frame. All sheets are collected first and
# concatenated once: DataFrame.append was removed in pandas 2.0, and
# appending inside the loop copied the accumulated frame on every iteration.
excel_frames = [
    pd.read_excel('install_at_site_{}.xls'.format(i), sheet_name="Sheet 1")
    for i in range(1, 75)
]
df_h_id = pd.concat(excel_frames)

## Merging Two Columns with ',' Separator

In [None]:
df['dropoff_comp'] = df['dropoff_city'].str.cat(df['dropoff_state_code'], sep=',')

## Lambda If Application

In [None]:
df2['rating_class'] = df2['rating'].apply(lambda x: 'bad' if x < 3 else('good' if x > 3 else 'neutral'))

## API Implementation

In [None]:
# For every "pickup-dropoff" string in df2.city_dist, ask the Google Distance
# Matrix API for the distance and collect its "text" field in list2.
# A failed lookup contributes 0 so list2 stays aligned with df2.city_dist.
list2 = []
for x in df2.city_dist:
    try:
        # Text before the first '-' is the origin, the next part the destination
        parts = x.split('-')
        pick, drop = parts[0], parts[1]
        url = "https://maps.googleapis.com/maps/api/distancematrix/json?units=imperial&origins={}&destinations={}&key={}".format(pick,drop,api)
        # The context manager closes the HTTP response once it has been read
        with urllib.request.urlopen(url) as response:
            res = response.read()
        data = json.loads(res.decode())
        list2.append(data["rows"][0]["elements"][0]["distance"]["text"])
    except Exception:
        # Best-effort: malformed input, network errors and unexpected payloads
        # all count as "no distance". `Exception` (instead of a bare except)
        # lets KeyboardInterrupt through, so the loop can still be stopped.
        list2.append(0)

# Dateutil - pip install python-dateutil

In [None]:
from dateutil.relativedelta import *
from dateutil.easter import *
from dateutil.rrule import *
from dateutil.parser import *
from datetime import *
now = parse("Sat Oct 11 17:13:46 UTC 2003")
today = now.date()
year = rrule(YEARLY,dtstart=now,bymonth=8,bymonthday=13,byweekday=FR)[0].year
rdelta = relativedelta(easter(year), today)
print("Today is: %s" % today)
print("Year with next Aug 13th on a Friday is: %s" % year)
print("How far is the Easter of that year: %s" % rdelta)
print("And the Easter of that year is: %s" % (today+rdelta))

## Creating a DF With List

In [None]:
df3 = pd.DataFrame(
    {'year': year,
     'model': model,
     'make': make,
     'type': type1
    })

## Datetime and Calendar

In [None]:
import datetime as dt
import calendar

# Parse estimated_ship_date into timestamps; unparseable values become NaT.
# The result is stored with df[...] so that it becomes a real column;
# assigning to the attribute df.event_time does not create one.
df['event_time'] = pd.to_datetime(df.estimated_ship_date, errors='coerce')
# Calendar components derived through the vectorised .dt accessor;
# rows with NaT yield missing values.
df['event_year'] = df.event_time.dt.year
df['event_month'] = df.event_time.dt.month
df['event_month_name'] = df.event_time.dt.month_name()
df['event_day'] = df.event_time.dt.day
# .dt.weekday_name is no longer available in pandas; day_name() replaces it
df['event_day_of_week'] = df.event_time.dt.day_name()
df['event_day_of_week_number'] = df.event_time.dt.weekday
df['event_hour'] = df.event_time.dt.hour

# Installing Any Package

In [None]:
import sys
!{sys.executable} -m pip install inflect

## Plotly

In [None]:
plotly.tools.set_credentials_file(username='', api_key='')

### Converting the notebook to html

In [None]:
jupyter nbconvert --to html mynotebook.ipynb

# Importing an Image

In [None]:
from IPython.display import Image
img = 'TF_IDF_3_class.jpg'
Image(filename=img)

# Genetic Programming (TPOT)

## Regressor

In [None]:
tpot = TPOTRegressor(max_time_minutes = 60,
                     generations=5, 
                     population_size=20, 
                     verbosity=2,
                    n_jobs = -1,
                    cv = 5)
tpot.fit(X_scaled, y_train)
print(tpot.score(X_test_scaled, y_test))
#tpot.export('tpot_boston_pipeline.py')

In [None]:
# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
# Every column except 'target' is a feature
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)

# NOTE(review): loss="ls" is only accepted by older scikit-learn releases;
# confirm the value against the installed version.
exported_pipeline = GradientBoostingRegressor(alpha=0.85, learning_rate=0.1, loss="ls",
                                              max_features=0.9, min_samples_leaf=5,
                                              min_samples_split=6)

# Fit on the target produced by the split above (the name `training_classes`
# used before was never defined).
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

## Classifier

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
# Every column except 'target' is a feature
features = tpot_data.drop('target', axis=1).values
training_features, testing_features, training_target, testing_target = \
            train_test_split(features, tpot_data['target'].values, random_state=None)


exported_pipeline = KNeighborsClassifier(n_neighbors=6, weights="distance")

# Fit on the target produced by the split above (the name `training_classes`
# used before was never defined).
exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

# NLP Normalize Corpus

In [None]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    """Normalize every document in ``corpus`` and return them as a list.

    Each boolean flag switches one step on or off. The steps run in this
    order: HTML stripping, accented-character removal, contraction expansion,
    lower-casing, newline collapsing (always), lemmatization, special
    character (and optionally digit) removal, whitespace collapsing (always)
    and stopword removal.

    Args:
        corpus: Iterable of document strings.
        html_stripping: Call ``strip_html_tags`` on each document.
        contraction_expansion: Call ``expand_contractions``.
        accented_char_removal: Call ``remove_accented_chars``.
        text_lower_case: Lower-case the text; also forwarded to
            ``remove_stopwords`` as ``is_lower_case``.
        text_lemmatization: Call ``lemmatize_text``.
        special_char_removal: Isolate punctuation with spaces, then call
            ``remove_special_characters``.
        stopword_removal: Call ``remove_stopwords``.
        remove_digits: Forwarded to ``remove_special_characters``.

    Returns:
        List of normalized documents, in input order.
    """
    # Patterns are compiled once instead of on every document.
    # Runs of CR/LF characters. The former class '[\r|\n|\r\n]' also matched a
    # literal '|', because '|' is not an alternation inside a character class.
    newline_pattern = re.compile(r'[\r\n]+')
    # Matches one of { . ( ) ! } -- inside the class '(-)' is the range from
    # '(' to ')', so a hyphen itself is not matched.
    # NOTE(review): confirm whether '-' was meant to be included.
    special_char_pattern = re.compile(r'([{.(-)!}])')
    multi_space_pattern = re.compile(' +')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # replace runs of newlines with a single space
        doc = newline_pattern.sub(' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # insert spaces around special characters to isolate them
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)
        # collapse repeated spaces
        doc = multi_space_pattern.sub(' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus

# Auto ML and Pickle

In [None]:
from auto_ml import Predictor
#from catboost import CatBoostRegressor
import pickle

# Role of each column for auto_ml: 'price_total' is the value to predict,
# the remaining listed columns are categorical features.
column_descriptions = {
  'price_total': 'output', 'pickup_city': 'categorical', 'dropoff_city': 'categorical', 
    'pickup_state_code': 'categorical', 'dropoff_state_code': 'categorical', 'make': 'categorical', 
    'model': 'categorical'} #  no need for 'type': 'categorical'


types = df["type"].unique()

#types=["Car"]

# Train, score and pickle one model per vehicle type
for t in types: # this would be parallelised later

    # Rows of this type only; 'type' is constant here, so it is dropped
    temp_df = df[df["type"] == t].drop(["type"], axis=1)

    ml_predictor = Predictor(type_of_estimator='regressor', column_descriptions=column_descriptions)

    print ('training started for vehicle type >> ', t)

    ml_predictor.train(temp_df, model_names='GradientBoostingRegressor', cv=5)

    # NOTE: this R^2 is computed on the same rows the model was trained on
    score = r2_score(temp_df.price_total, ml_predictor.predict(temp_df))

    print ('score for {} is {}'.format(t, round(score,4)))

    filename = './models_by_types1/GB_model_{}_{}.ml'.format(t, score)

    # The context manager closes (and flushes) the file even if dumping fails
    with open(filename, 'wb') as model_file:
        pickle.dump(ml_predictor, model_file)

# Load Pickle File

In [None]:
model_name="./models_by_types1/GB_model_Car_0.9009313891926389.ml"

import pickle

# NOTE: unpickling executes code stored in the file -- only load model files
# you created yourself. The context manager closes the file after loading.
with open(model_name, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Same preparation as at training time: 'Car' rows without the 'type' column
temp_df=df[df["type"] == "Car"].drop(["type"], axis=1)

# Predict the first ten rows as a quick check
loaded_model.predict(temp_df[:10])

# API_Auto_ML_Creation

In [None]:
from werkzeug.wrappers import Request, Response
from werkzeug.serving import run_simple
import pickle
import flask

#model_name="./models_by_types1/GB_model_Car_0.9009313891926389.ml"

app = flask.Flask(__name__)

#getting our trained model from a file we created earlier
#model = pickle.load(open(model_name,"rb"), encoding ='utf-8')
model_name="./full_model/GB_model_0.9024795449595084.ml"

loaded_model = pickle.load(open(model_name, 'rb'))

#loaded_model.predict(df1[10:20])

@app.route('/predict', methods=['POST','GET'])
def predict():
    """Predict from the JSON object sent in the request body.

    The body is turned into a one-row DataFrame (one column per key) and
    passed to ``loaded_model.predict``.

    Returns:
        A JSON response: ``{'success': True, 'predictions': ...}`` on success,
        or ``{'success': False, 'error': ...}`` when the body is not a JSON
        object.
    """
    # silent=True makes get_json return None instead of raising on a missing
    # or malformed body, so the result is validated explicitly.
    feature_array = flask.request.get_json(silent=True)

    print (feature_array)

    if not isinstance(feature_array, dict):
        return flask.jsonify({'success':False, "error": "request body must be a JSON object"})

    # convert this json to dataframe
    df1=pd.DataFrame([feature_array], columns=feature_array.keys())

    prediction = loaded_model.predict(df1)

    # Array-like results are converted to plain lists so that they can be
    # serialised to JSON
    if hasattr(prediction, 'tolist'):
        prediction = prediction.tolist()

    return flask.jsonify({'success':True, "predictions": prediction})

if __name__ == '__main__':
    run_simple('192.168.1.62', 8081, app)

# API Request

In [None]:
import json
payload=json.loads(df.iloc[9].to_json())
api_call=requests.post("http://192.168.1.62:8081/predict", json=payload)
json.loads(api_call.text)

In [None]:
print(payload)

In [None]:
api_call.text

# Creating a New Folder

In [None]:
#!mkdir filename

# Google Distance Matrix API

In [None]:
import urllib.request
import json
# NOTE(review): this API key is hard-coded in the notebook; consider loading
# it from configuration or an environment variable and rotating it.
api = 'AIzaSyAFfa7OW7CLVZ7ZSP36avc1f8BQudNDlOM'
# Fill these in before running; the other cells of this notebook build them
# as "<city>,<state code>".
pick = ''  # TODO: origin
drop = ''  # TODO: destination
url = "https://maps.googleapis.com/maps/api/distancematrix/json?units=imperial&origins={}&destinations={}&key={}".format(pick,drop,api)
# The context manager closes the HTTP response once it has been read
with urllib.request.urlopen(url) as response:
    res = response.read()
data = json.loads(res.decode())
# Distance text of the first origin/destination pair
print(data["rows"][0]["elements"][0]["distance"]["text"])

# CATBOOST API 

In [None]:
ml_predictor = CatBoostRegressor(iterations=1000, depth=10, learning_rate=0.05, loss_function='RMSE', eval_metric='R2')

#print ("training started for vehicle type >> ", t)
    
ml_predictor.fit(X_train, y_train, cat_features=categorical_features_indices, eval_set=(X_test, y_test),plot=False) 

score = r2_score(y_test, ml_predictor.predict(X_test))
    
print ("score is {}".format(round(score,4)))
filename = './model_catboost/CB_model_{}.ml'.format(score)
ml_predictor.save_model(filename)

In [None]:
from werkzeug.wrappers import Request, Response
from werkzeug.serving import run_simple
import pickle
import flask
import urllib.request
import json
import pandas as pd
import dill
from catboost import CatBoostRegressor
import datetime as dt
import calendar

#model_name="./models_by_types1/GB_model_Car_0.9009313891926389.ml"

app = flask.Flask(__name__)

#getting our trained model from a file we created earlier
#model = pickle.load(open(model_name,"rb"), encoding ='utf-8')
ml_predictor = CatBoostRegressor(iterations=1000, depth=10, learning_rate=0.05, loss_function='RMSE', eval_metric='R2')

model_name="./model_catboost/CB_model_0.8825854610673416.coreml"

ml_predictor.load_model(model_name)

#loaded_model = pickle.load(open(model_name, 'rb'))

def distance (pickup_city, pickup_state, dropoff_city, dropoff_state):
    """Look up the distance between two places via the Google Distance Matrix API.

    Origin and destination are sent as "<city>,<state>" with spaces in the
    city replaced by '+'.

    Returns:
        The API's distance "value" divided by 1609 and rounded to two decimals
        (presumably metres converted to miles -- confirm against the API
        documentation), or 0 when the lookup fails for any reason.
    """
    pick = pickup_city.replace(' ', '+') + ',' + pickup_state
    drop = dropoff_city.replace(' ', '+') + ',' + dropoff_state

    # NOTE(review): this API key is hard-coded; consider loading it from
    # configuration or an environment variable and rotating it.
    api = 'AIzaSyAFfa7OW7CLVZ7ZSP36avc1f8BQudNDlOM'

    try:
        url = "https://maps.googleapis.com/maps/api/distancematrix/json?units=imperial&origins={}&destinations={}&key={}".format(pick, drop, api)

        # The context manager closes the HTTP response once it has been read
        with urllib.request.urlopen(url) as response:
            res = response.read()

        data = json.loads(res.decode())

        result = round((data["rows"][0]["elements"][0]["distance"]["value"]/1609),2)
    except Exception:
        # Best-effort lookup: callers treat 0 as "distance unavailable".
        # `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        result = 0

    return result


@app.route('/predict', methods=['POST','GET'])
def predict():
    """Predict from the JSON object sent in the request body.

    The request fields are lower-cased where needed, enriched with the
    distance between pickup and dropoff and with calendar features derived
    from ``estimated_ship_date``, reduced to the model's feature columns and
    passed to ``ml_predictor.predict``.

    Returns:
        A JSON response: ``{'success': True, 'predictions': <value>}`` on
        success, or ``{'success': False, 'error': <message>}`` when the body
        cannot be read, is not a JSON object, or the distance lookup fails.
    """
    try:
        feature_array = flask.request.get_json()

        print (feature_array)
    except Exception as e:
        # flask.jsonify: the bare name `jsonify` is not defined in this cell
        return flask.jsonify({'success':False, "error": str(e)})

    if not isinstance(feature_array, dict):
        return flask.jsonify({'success':False, "error": "request body must be a JSON object"})

    feature_array['make']=feature_array['make'].lower()

    # Lower-case the model itself (it used to be overwritten with the make)
    feature_array['model']= feature_array['model'].lower()

    feature_array["distance"] = distance (feature_array['pickup_city'],
                                          feature_array['pickup_state_code'],
                                          feature_array['dropoff_city'],
                                          feature_array['dropoff_state_code'])

    # distance() returns 0 when the lookup failed
    if feature_array["distance"] == 0:

        return flask.jsonify({'success':False, "error": "something wrong with Google matrix distance API"})

    # One-row frame with one column per request field
    df=pd.DataFrame([feature_array], columns=feature_array.keys())

    # Calendar features from the ship date; an unparseable date becomes NaT
    # and yields missing values. The .dt accessor replaces the per-element
    # lambdas, which relied on `np` although this cell never imports it.
    df['estimated_ship_date']=pd.to_datetime(df['estimated_ship_date'], errors='coerce')
    ship_date = df['estimated_ship_date'].dt
    df['event_year'] = ship_date.year
    df['event_month'] = ship_date.month
    df['event_month_name'] = ship_date.month_name()
    df['event_day'] = ship_date.day
    df['event_day_of_week_number'] = ship_date.weekday

    # Select the feature columns in the order passed to the model
    df=df[['pickup_state_code', 'dropoff_state_code', 'vehicle_runs', 'make', 'model', 'type', 'year', 'diesel_price', 
           'event_year', 'event_month_name', 'event_day', 
           'event_day_of_week_number','distance','pickup_city', 'dropoff_city', 'ship_via_id']]

    print (df)
    # Writes the model input to disk on every request
    df.to_csv('df_xx.csv', index=False)

    prediction = ml_predictor.predict(df)

    # float() turns the array element into a plain Python number for JSON
    return flask.jsonify({'success':True, "predictions": float(prediction[0])})


if __name__ == '__main__':
     run_simple('192.168.1.62', 8081, app)

# API_H2O

In [None]:
from werkzeug.wrappers import Request, Response
from werkzeug.serving import run_simple
import pickle
import flask
import urllib.request
import json
import pandas as pd
import dill
import h2o


#model_name="./models_by_types1/GB_model_Car_0.9009313891926389.ml"

app = flask.Flask(__name__)

#getting our trained model from a file we created earlier
#model = pickle.load(open(model_name,"rb"), encoding ='utf-8')

model_path = (r"C:\Users\Mike\Desktop\Truck\full_model_H2O\modell_h2o\dnn_default")

saved_model = h2o.load_model(model_path)



#loaded_model.predict(df1[10:20])

def distance (pickup_city, pickup_state_code, dropoff_city, dropoff_state_code):
    """Look up the distance between two places via the Google Distance Matrix API.

    Origin and destination are sent as "<city>,<state code>" with spaces in
    the city replaced by '+'. A successful result is also printed.

    Returns:
        The API's distance "value" divided by 1609 and rounded to two decimals
        (presumably metres converted to miles -- confirm against the API
        documentation), or 0 when the lookup fails for any reason.
    """
    pick = pickup_city.replace(' ', '+') + ',' + pickup_state_code
    drop = dropoff_city.replace(' ', '+') + ',' + dropoff_state_code

    # NOTE(review): this API key is hard-coded; consider loading it from
    # configuration or an environment variable and rotating it.
    api = 'AIzaSyAFfa7OW7CLVZ7ZSP36avc1f8BQudNDlOM'

    try:
        url = "https://maps.googleapis.com/maps/api/distancematrix/json?units=imperial&origins={}&destinations={}&key={}".format(pick, drop, api)

        # The context manager closes the HTTP response once it has been read
        with urllib.request.urlopen(url) as response:
            res = response.read()

        data = json.loads(res.decode())

        result = round((data["rows"][0]["elements"][0]["distance"]["value"]/1609),2)
        print (result)

    except Exception:
        # Best-effort lookup: callers treat 0 as "distance unavailable".
        # `Exception` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        result = 0

    return result

@app.route('/predict', methods=['POST','GET'])
def predict():
    """Predict from the JSON object sent in the request body using the H2O model.

    The request fields are enriched with the distance between pickup and
    dropoff, converted to an H2OFrame and passed to ``saved_model.predict``.

    Returns:
        A JSON response: ``{'success': True, 'predictions': <JSON string of
        the prediction frame>}`` on success, or ``{'success': False,
        'error': <message>}`` when the body cannot be read, is not a JSON
        object, or the distance lookup fails.
    """
    try:
        feature_array = flask.request.get_json()

        print (feature_array)

    except Exception as e:
        # flask.jsonify: the bare name `jsonify` is not defined in this cell
        return flask.jsonify({'success':False, "error": str(e)})

    if not isinstance(feature_array, dict):
        return flask.jsonify({'success':False, "error": "request body must be a JSON object"})

    feature_array["distance"] = distance (feature_array['pickup_city'],
                                          feature_array['pickup_state_code'],
                                          feature_array['dropoff_city'],
                                          feature_array['dropoff_state_code'])
    # distance() returns 0 when the lookup failed
    if feature_array["distance"] == 0:

        return flask.jsonify({'success':False, "error": "something wrong with Google matrix distance API"})

    # One-row frame with one column per request field, then its H2O equivalent
    df1=pd.DataFrame([feature_array], columns=feature_array.keys())

    hf = h2o.H2OFrame(df1)

    prediction = saved_model.predict(hf)

    print (type(prediction))
    # Convert the prediction frame to a JSON string via pandas
    prediction = prediction.as_data_frame().to_json()
    print (type(prediction))

    return flask.jsonify({'success':True, "predictions": prediction})

if __name__ == '__main__':
    run_simple('192.168.1.62', 8081, app)

In [None]:
# Import matplotlib.pyplot
import matplotlib.pyplot as plt

# Calculate number of unique values for each label: num_unique_labels
num_unique_labels =df[LABELS].apply(pd.Series.nunique)

# Plot number of unique values for each label
num_unique_labels.plot(kind='bar')

# Label the axes
plt.xlabel('Labels')
plt.ylabel('Number of unique values')

# Display the plot
plt.show()

In [None]:
# Create the new DataFrame: numeric_data_only
numeric_data_only = df[NUMERIC_COLUMNS].fillna(-1000)

# Get labels and convert to dummy variables: label_dummies
label_dummies =pd.get_dummies(df[LABELS])

# Create training and test sets
X_train, X_test, y_train, y_test = multilabel_train_test_split(numeric_data_only,
                                                               label_dummies,
                                                               size=0.2, 
                                                               seed=123)

# Print the info
print("X_train info:")
print(X_train.info())
print("\nX_test info:")  
print(X_test.info())
print("\ny_train info:")  
print(y_train.info())
print("\ny_test info:")  
print(y_test.info()) 


In [None]:
# Instantiate the classifier: clf
clf = OneVsRestClassifier(LogisticRegression())

# Fit it to the training data
clf.fit(X_train, y_train)

# Load the holdout data: holdout
holdout = pd.read_csv('HoldoutData.csv', index_col=0)

# Generate predictions: predictions
predictions = clf.predict_proba(holdout[NUMERIC_COLUMNS].fillna(-1000))

In [None]:
# Generate predictions: predictions
predictions = clf.predict_proba(holdout[NUMERIC_COLUMNS].fillna(-1000))

# Format predictions in DataFrame: prediction_df
prediction_df = pd.DataFrame(columns=pd.get_dummies(df[LABELS]).columns,
                             index=holdout.index,
                             data=predictions)


# Save prediction_df to csv
prediction_df.to_csv('predictions.csv')

# Submit the predictions for scoring: score
score = score_submission(pred_path='predictions.csv')

# Print score
print('Your model, trained with numeric data only, yields logloss score: {}'.format(score))

In [None]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Create the token pattern: TOKENS_ALPHANUMERIC
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Fill missing values in df.Position_Extra
df.Position_Extra.fillna('', inplace=True)

# Instantiate the CountVectorizer: vec_alphanumeric
vec_alphanumeric = CountVectorizer(token_pattern = TOKENS_ALPHANUMERIC)

# Fit to the data
vec_alphanumeric.fit(df.Position_Extra)

# Print the number of tokens and first 15 tokens
msg = "There are {} tokens in Position_Extra if we split on non-alpha numeric"
print(msg.format(len(vec_alphanumeric.get_feature_names())))
print(vec_alphanumeric.get_feature_names()[:15])

In [None]:
# Define combine_text_columns()
def combine_text_columns(data_frame, to_drop=NUMERIC_COLUMNS + LABELS):
    """ Merge the text columns of each row of data_frame into one string.

    Columns listed in to_drop that exist in data_frame are removed, missing
    values are replaced by empty strings, and the remaining values of every
    row are joined with single spaces.
    """
    # Only columns actually present in the frame can be dropped
    present_columns = set(data_frame.columns.tolist())
    droppable = set(to_drop).intersection(present_columns)
    text_only = data_frame.drop(droppable, axis=1)

    # Blank out NaNs so that joining does not fail on them
    text_only.fillna('', inplace=True)

    # One space-separated string per row
    def join_row(row):
        return " ".join(row)

    return text_only.apply(join_row, axis=1)

In [None]:
# Import the CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Create the basic token pattern
TOKENS_BASIC = '\\S+(?=\\s+)'

# Create the alphanumeric token pattern
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Instantiate basic CountVectorizer: vec_basic
vec_basic = CountVectorizer(token_pattern=TOKENS_BASIC)

# Instantiate alphanumeric CountVectorizer: vec_alphanumeric
vec_alphanumeric = CountVectorizer(token_pattern=TOKENS_ALPHANUMERIC)

# Create the text vector
text_vector = combine_text_columns(df)

# Fit and transform vec_basic
vec_basic.fit_transform(text_vector)

# Print number of tokens of vec_basic
print("There are {} tokens in the dataset".format(len(vec_basic.get_feature_names())))

# Fit and transform vec_alphanumeric
vec_alphanumeric.fit_transform(text_vector)

# Print number of tokens of vec_alphanumeric
print("There are {} alpha-numeric tokens in the dataset".format(len(vec_alphanumeric.get_feature_names())))

In [None]:
# Import Pipeline
from sklearn.pipeline import Pipeline

# Import other necessary modules
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# Split and select numeric data only, no nans 
X_train, X_test, y_train, y_test = train_test_split(sample_df[['numeric']],
                                                    pd.get_dummies(sample_df['label']), 
                                                    random_state=22)

# Instantiate Pipeline object: pl
pl = Pipeline([
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

# Fit the pipeline to the training data
pl.fit(X_train, y_train)

# Compute and print accuracy
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on sample data - numeric, no nans: ", accuracy)

In [None]:
# Import the Imputer object
from sklearn.preprocessing import Imputer

# Create training and test sets using only numeric data
X_train, X_test, y_train, y_test = train_test_split(sample_df[['numeric', 'with_missing']],
                                                    pd.get_dummies(sample_df['label']), 
                                                    random_state=456)

# Instantiate Pipeline object: pl
pl = Pipeline([
        ('imp', Imputer()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

# Fit the pipeline to the training data
pl.fit(X_train, y_train)

# Compute and print accuracy
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on sample data - all numeric, incl nans: ", accuracy)

In [None]:
# Import the CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Split out only the text data
X_train, X_test, y_train, y_test = train_test_split(sample_df['text'],
                                                    pd.get_dummies(sample_df['label']), 
                                                    random_state=456)

# Instantiate Pipeline object: pl
pl = Pipeline([
        ('vec', CountVectorizer()),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])

# Fit to the training data
pl.fit(X_train, y_train)

# Compute and print accuracy
accuracy = pl.score(X_test,y_test)
print("\nAccuracy on sample data - just text data: ", accuracy)

In [None]:
# Import FunctionTransformer
from sklearn.preprocessing import FunctionTransformer

# Obtain the text data: get_text_data
get_text_data = FunctionTransformer(lambda x: x['text'], validate=False)

# Obtain the numeric data: get_numeric_data
get_numeric_data = FunctionTransformer(lambda x: x[['numeric', 'with_missing']], validate=False)

# Fit and transform the text data: just_text_data
just_text_data = get_text_data.fit_transform(sample_df)

# Fit and transform the numeric data: just_numeric_data
just_numeric_data = get_numeric_data.fit_transform(sample_df)

# Print head to check results
print('Text Data')
print(just_text_data.head())
print('\nNumeric Data')
print(just_numeric_data.head())

In [None]:
# Import FeatureUnion
from sklearn.pipeline import FeatureUnion

# Split using ALL data in sample_df
X_train, X_test, y_train, y_test = train_test_split(sample_df[['numeric', 'with_missing', 'text']],
                                                    pd.get_dummies(sample_df['label']), 
                                                    random_state=22)

# Create a FeatureUnion with nested pipeline: process_and_join_features
process_and_join_features = FeatureUnion(
            transformer_list = [
                ('numeric_features', Pipeline([
                    ('selector', get_numeric_data),
                    ('imputer', Imputer())
                ])),
                ('text_features', Pipeline([
                    ('selector', get_text_data),
                    ('vectorizer', CountVectorizer())
                ]))
             ]
        )

# Instantiate nested pipeline: pl
pl = Pipeline([
        ('union', process_and_join_features),
        ('clf', OneVsRestClassifier(LogisticRegression()))
    ])


# Fit pl to the training data
pl.fit(X_train, y_train)

# Compute and print accuracy
accuracy = pl.score(X_test, y_test)
print("\nAccuracy on sample data - all data: ", accuracy)


In [None]:
def fxy(x, y):
    """Return x when it equals 'SideA'; otherwise return y."""
    if x == 'SideA':
        return x
    return y

intra['newcolumn'] = intra[['SideADeaths','SideBDeaths']].astype(str).apply(lambda x: fxy(x['SideADeaths'], x['SideBDeaths']), axis=1)

In [None]:
# https://towardsdatascience.com/bringing-the-best-out-of-jupyter-notebooks-for-data-science-f0871519ca29