# Refining Profile Data
Adding synthetic attributes (location, qualification, specialisation, age and years of experience) to each mechanic profile.

### Library and Data Imports

In [30]:
import pandas as pd
import _pickle as pickle
import numpy as np
from scipy.stats import halfnorm

In [43]:
# Load the pickled object stored in mechanic_profiles.pkl into `df`; the
# following cells treat it as a DataFrame that has a 'Bios' column.
# NOTE: pickle.load can execute arbitrary code while unpickling, so only
# open files that come from a trusted source.
with open("mechanic_profiles.pkl",'rb') as fp:
    df = pickle.load(fp)

In [44]:
# Keep only the 'Bios' text column and drop everything else (the original
# note describes the dropped columns as numerical data).
# .copy() gives an independent frame, so the columns added in later cells are
# written to this frame itself rather than to a slice of the loaded one
# (avoids pandas' SettingWithCopyWarning).
df = df[['Bios']].copy()

### Creating Lists for the Categories

In [45]:
# Seed NumPy's global random generator so the random draws in this notebook
# are repeatable when the cells are run top to bottom. Both scipy's
# halfnorm.rvs (when no random_state is passed) and np.random.choice draw
# from this global generator.
SEED = 42
np.random.seed(SEED)

# Sampling weights for each categorical attribute, keyed by column name.
# Each list is positionally aligned with the option list of the same name.
p = {}

# Locations
# NOTE(review): 'Jakata' looks like a typo for 'Jakarta'; the string is left
# unchanged so that it keeps matching values already stored in saved data.
location = ['Jakata',
            'Sumatra',
            'Java',
            'Bali',
            'Kalimantan',
            'Palangkaraya',
            'Samarinda',
            'Banjarmasin',
            'Bandung']

p['location'] = [0.28,
                 0.01,
                 0.16,
                 0.14,
                 0.09,
                 0.06,
                 0.04,
                 0.01,
                 0.21]

# Qualifications
Qualification = ['High School',
                 'Diploma',
                 'Bachelors Degree',
                 'Professional Mechanic',
                 'Junior Mechanic',
                 'Mechanic',
                 'Master',
                 'Doctorate',
                 'Associate']

p['Qualification'] = [0.30,
                      0.23,
                      0.12,
                      0.12,
                      0.09,
                      0.08,
                      0.03,
                      0.02,
                      0.01]

# Specialisations (a profile can end up with more than one)
# NOTE(review): 'Maintenance Technian' looks like a typo for
# 'Maintenance Technician'; left unchanged to keep generated values stable.
Specialisation = ['Advanced Auto Diagnostic Techniques',
                  'Advanced Vehicle Diagnostics and Management',
                  'Automotive Maintenance and Repair',
                  'Motor Vehicle Engineering',
                  'Heavy Vehicle Service',
                  'Maintenance Technian',
                  'Automotive Apprenticeships',
                  'Rail Apprenticeships',
                  'Rail Engineering Operative',
                  'Light Vehicle Service']

p['Specialisation'] = [0.16,
                       0.16,
                       0.01,
                       0.19,
                       0.11,
                       0.05,
                       0.10,
                       0.09,
                       0.07,
                       0.06]

# Fail early if a weight list is out of step with its option list: the
# lengths must match and the weights must sum to 1 for np.random.choice.
assert all(
    len(p[key]) == len(options) and np.isclose(sum(p[key]), 1.0)
    for key, options in (('location', location),
                         ('Qualification', Qualification),
                         ('Specialisation', Specialisation))
), "each probability list must match its option list in length and sum to 1"

# Age and years of experience: one half-normal draw per profile. `loc` is the
# minimum possible value (18 for age, 3 for experience) and `scale` controls
# the spread; .astype(int) truncates the fractional part.
# NOTE(review): the two columns are drawn independently, so a profile can get
# more years of experience than is plausible for its age.
age = halfnorm.rvs(loc=18, scale=8, size=df.shape[0]).astype(int)
year_of_experience = halfnorm.rvs(loc=3, scale=5, size=df.shape[0]).astype(int)

# Column names paired (in the same order) with the data used to fill them:
# option lists for the categorical columns, ready-made arrays for the numeric ones.
categories = [location, Qualification, Specialisation, age, year_of_experience]

names = ['location', 'Qualification', 'Specialisation', 'Age', "year of experience"]

combined = dict(zip(names, categories))

### Establishing random values for each category

In [46]:
# Fill one DataFrame column per entry of `combined`.
for name, cats in combined.items():
    if name in ('location', 'Qualification'):
        # Single-valued categories: one weighted draw per profile.
        df[name] = np.random.choice(cats, df.shape[0], p=p[name])

    elif name in ('Age', 'year of experience'):
        # Numeric columns: `cats` is already an array holding one
        # pre-generated value per profile, so it is assigned as-is.
        df[name] = cats

    else:
        # Multi-valued categories: three weighted draws per profile, made
        # WITH replacement, then collapsed to unique values, so each row
        # ends up with between 1 and 3 entries.
        # p.get(name) is None when no weights were defined for the category,
        # which makes np.random.choice sample uniformly. Any other problem
        # (for example weights that do not sum to 1) now raises instead of
        # being silently replaced by uniform sampling.
        draws = np.random.choice(cats, size=(df.shape[0], 3), p=p.get(name))

        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the stored lists are reproducible for a given random state (set()
        # ordering of strings can differ between Python processes).
        df[name] = [list(dict.fromkeys(row)) for row in draws.tolist()]
        

In [47]:
# Display the frame with the newly added columns; for a frame this long the
# rendered output elides the middle rows.
df

Unnamed: 0,Bios,location,Qualification,Specialisation,Age,year of experience
0,Passionate analyst. Explorer. Hipster-friendly...,Kalimantan,High School,"[Advanced Auto Diagnostic Techniques, Automoti...",21,8
1,Twitter fanatic. Devoted web fanatic. Zombie e...,Bali,High School,"[Advanced Vehicle Diagnostics and Management, ...",29,5
2,Total alcohol practitioner. Social media buff....,Java,Bachelors Degree,"[Rail Apprenticeships, Motor Vehicle Engineering]",23,13
3,Extreme twitter advocate. Hardcore internet ju...,Jakata,Bachelors Degree,"[Automotive Apprenticeships, Motor Vehicle Eng...",20,7
4,Problem solver. Devoted introvert. Food geek. ...,Palangkaraya,High School,"[Advanced Vehicle Diagnostics and Management, ...",24,8
...,...,...,...,...,...,...
6160,Lifelong zombie junkie. Friendly travel buff. ...,Bandung,Bachelors Degree,"[Heavy Vehicle Service, Advanced Vehicle Diagn...",19,12
6161,Total introvert. Tv specialist. Pop culture ni...,Bandung,High School,"[Advanced Vehicle Diagnostics and Management, ...",31,6
6162,Friendly zombie specialist. Avid bacon expert....,Kalimantan,Diploma,"[Advanced Auto Diagnostic Techniques, Advanced...",22,7
6163,Wannabe coffee practitioner. Troublemaker. Com...,Java,High School,"[Heavy Vehicle Service, Advanced Vehicle Diagn...",21,5


### Categorizing

In [48]:
# Convert the two single-valued columns to ordered categoricals.
# The category order comes from the `location` and `Qualification` lists that
# the values were sampled from, so the categories cannot drift out of sync
# with the generated data (a value missing from `categories` would silently
# become NaN).
# NOTE(review): ordered=True is kept from the original; the list order does
# not obviously represent a ranking, so confirm that an ordering is intended.
df['location'] = pd.Categorical(df['location'],
                                categories=location,
                                ordered=True)
df['Qualification'] = pd.Categorical(df['Qualification'],
                                     categories=Qualification,
                                     ordered=True)



### Exporting the DF

In [51]:
# Serialise the refined DataFrame to refined_profiles_1.pkl; opening with
# 'wb' overwrites any existing file of that name.
with open("refined_profiles_1.pkl",'wb') as fp:
    pickle.dump(df, fp)

In [53]:
# Read back the file written by the export cell, replacing `df` with the
# unpickled copy, so the preview below shows what was actually saved.
with open("refined_profiles_1.pkl",'rb') as fp:
    df = pickle.load(fp)

# Preview the first five rows of the reloaded DataFrame
df.head()

Unnamed: 0,Bios,location,Qualification,Specialisation,Age,year of experience
0,Passionate analyst. Explorer. Hipster-friendly...,Kalimantan,High School,"[Advanced Auto Diagnostic Techniques, Automoti...",21,8
1,Twitter fanatic. Devoted web fanatic. Zombie e...,Bali,High School,"[Advanced Vehicle Diagnostics and Management, ...",29,5
2,Total alcohol practitioner. Social media buff....,Java,Bachelors Degree,"[Rail Apprenticeships, Motor Vehicle Engineering]",23,13
3,Extreme twitter advocate. Hardcore internet ju...,Jakata,Bachelors Degree,"[Automotive Apprenticeships, Motor Vehicle Eng...",20,7
4,Problem solver. Devoted introvert. Food geek. ...,Palangkaraya,High School,"[Advanced Vehicle Diagnostics and Management, ...",24,8
