# Author

- Author: Nicolas Huber
- Mail: nicolassebastian.huber@student.uts.edu.au
- GitHub: HuberNicolas
- Repository: https://github.com/HuberNicolas/python-data-processing-uts

# Source

https://www.kaggle.com/datasets/mexwell/drug-consumption-classification


In [292]:
# Imports

import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import seaborn as sns

import random

In [293]:
# Constants

PATH = './data/drug_consumption.csv'

In [294]:
# Dev variables
DEV = True
DEV_FRACTION = 0.01
RANDOM_STATE = 31011997
np.random.seed(RANDOM_STATE)

In [295]:
# Explicitly define the order of the target column values
CATEGORIES_ORDER = ['cl0', 'cl1', 'cl2', 'cl3', 'cl4', 'cl5', 'cl6']

In [296]:
# Mapping categories based on source
CATEGORIES = {
    'cl0': 'Never Used',
    'cl1': 'Used over a Decade Ago',
    'cl2': 'Used in Last Decade',
    'cl3': 'Used in Last Year',
    'cl4': 'Used in Last Month',
    'cl5': 'Used in Last Week',
    'cl6': 'Used in Last Day'
}

In [297]:
COLUMNS_TYPE = {
    'ID': int,
    'Age': float,
    'Gender': float,
    'Education': float,
    'Country': float,
    'Ethnicity': float,
    'Nscore': float,
    'Escore': float,
    'Oscore': float,
    'Ascore': float,
    'Cscore': float,
    'Impulsive': float,
    'SS': float,
    'Alcohol': str,
    'Amphet': str,
    'Amyl': str,
    'Benzos': str,
    'Caff': str,
    'Cannabis': str,
    'Choc': str,
    'Coke': str,
    'Crack': str,
    'Ecstasy': str,
    'Heroin': str,
    'Ketamine': str,
    'Legalh': str,
    'LSD': str,
    'Meth': str,
    'Mushrooms': str,
    'Nicotine': str,
    'Semer': str,
    'VSA': str
}

In [298]:
# Define columns for better clarity
RENAME_COLUMNS = {
    # Big Five
    # https://en.wikipedia.org/wiki/Big_Five_personality_traits
    'Nscore': 'Neuroticism_score',
    'Escore': 'Extraversion_score',
    'Oscore': 'Openness_score',
    'Ascore': 'Agreeableness_score',
    'Cscore': 'Conscientiousness_score',

    'Impulsive': 'Impulsive_score',
    'SS': 'Sensation_seeing_score'
}

#  Preparation: Load data

In [299]:
# Load data
try:
    df = pd.read_csv(filepath_or_buffer=PATH, header=0)
except FileNotFoundError:
    print('The specified file path does not exist.')
except Exception as e:
    print(f'An unexpected error occurred: {e}')

In [300]:
# For efficiency, using during development only
if DEV:
    # in dev mode, we reduce the data set quantity by sampling 10% to ease development speed
    df = df.sample(frac=DEV_FRACTION, random_state=RANDOM_STATE)
else:
    pass

In [301]:
# Display top rows
df.head(n=5)

Unnamed: 0,ID,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,...,Ecstasy,Heroin,Ketamine,Legalh,LSD,Meth,Mushrooms,Nicotine,Semer,VSA
1442,1446,1.09449,0.48246,1.16365,0.96082,-0.31685,-1.43907,0.16767,0.29338,0.94156,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL1,CL0,CL0
1091,1095,-0.95197,-0.48246,0.45468,-0.09765,-0.31685,-1.69163,-0.80615,-0.58331,-1.21213,...,CL4,CL0,CL0,CL4,CL4,CL0,CL3,CL0,CL0,CL4
1342,1346,-0.95197,-0.48246,-1.22751,-0.28519,-0.31685,0.82562,1.11406,1.65653,1.2861,...,CL0,CL0,CL2,CL5,CL0,CL0,CL0,CL5,CL0,CL0
794,798,-0.95197,0.48246,0.45468,0.96082,-0.31685,-0.58016,0.80523,0.44585,0.59042,...,CL3,CL0,CL0,CL2,CL0,CL0,CL0,CL5,CL0,CL2
184,185,1.82213,-0.48246,-1.22751,0.96082,-0.31685,0.04257,-0.80615,-0.97631,0.59042,...,CL0,CL0,CL0,CL0,CL0,CL0,CL0,CL6,CL0,CL0


In [302]:
# Define the mapping for age values
age_mapping = {
    -0.95197: '18 - 24',
    -0.07854: '25 - 34',
    0.49788: '35 - 44',
    1.09449: '45 - 54',
    1.82213: '55 - 64',
    2.59171: '65+'
}

# Create the 'age_values' column using the mapping
df['age_values'] = df['Age'].map(age_mapping)

In [303]:
# Define the mapping for gender values
gender_mapping = {
    0.48246: 'Female',
    -0.48246: 'Male'
}

# Create the 'gender_values' column using the mapping
df['gender_values'] = df['Gender'].map(gender_mapping)

In [304]:
# Define the mapping for education values
education_mapping = {
    -2.43591: 'Left School Before 16 years',
    -1.73790: 'Left School at 16 years',
    -1.43719: 'Left School at 17 years',
    -1.22751: 'Left School at 18 years',
    -0.61113: 'Some College, No Certificate Or Degree',
    -0.05921: 'Professional Certificate/Diploma',
    0.45468: 'University Degree',
    1.16365: 'Masters Degree',
    1.98437: 'Doctorate Degree'
}

# Create the 'education_values' column using the mapping
df['education_values'] = df['Education'].map(education_mapping)

In [305]:
# Define the mapping for country values
country_mapping = {
    -0.09765: 'Australia',
    0.24923: 'Canada',
    -0.46841: 'New Zealand',
    -0.28519: 'Other',
    0.21128: 'Republic of Ireland',
    0.96082: 'UK',
    -0.57009: 'USA'
}

# Create the 'country_values' column using the mapping
df['country_values'] = df['Country'].map(country_mapping)

In [306]:
# Define the mapping for ethnicity values
ethnicity_mapping = {
    -0.50212: 'Asian',
    -1.10702: 'Black',
    1.90725: 'Mixed-Black/Asian',
    0.12600: 'Mixed-White/Asian',
    -0.22166: 'Mixed-White/Black',
    0.11440: 'Other',
    -0.31685: 'White'
}

# Create the 'ethnicity_values' column using the mapping
df['ethnicity_values'] = df['Ethnicity'].map(ethnicity_mapping)


In [307]:
# Define the mapping for Nscore values
nscore_mapping = {
    -3.46436: 12,
    -3.15735: 13,
    -2.75696: 14,
    -2.52197: 15,
    -2.42317: 16,
    -2.34360: 17,
    -2.21844: 18,
    -2.05048: 19,
    -1.86962: 20,
    -1.69163: 21,
    -1.55078: 22,
    -1.43907: 23,
    -1.32828: 24,
    -1.19430: 25,
    -1.05308: 26,
    -0.92104: 27,
    -0.79151: 28,
    -0.67825: 29,
    -0.58016: 30,
    -0.46725: 31,
    -0.34799: 32,
    -0.24649: 33,
    -0.14882: 34,
    -0.05188: 35,
    0.04257: 36,
    0.13606: 37,
    0.22393: 38,
    0.31287: 39,
    0.41667: 40,
    0.52135: 41,
    0.62967: 42,
    0.73545: 43,
    0.82562: 44,
    0.91093: 45,
    1.02119: 46,
    1.13281: 47,
    1.23461: 48,
    1.37297: 49,
    1.49158: 50,
    1.60383: 51,
    1.72012: 52,
    1.83990: 53,
    1.98437: 54,
    2.12700: 55,
    2.28554: 56,
    2.46262: 57,
    2.61139: 58,
    2.82196: 59,
    3.27393: 60
}

# Create the 'nscore_values' column using the mapping
df['nscore_values'] = df['Nscore'].map(nscore_mapping)

In [308]:
# Define the mapping for Escore values
escore_mapping = {
    -3.27393: 16,
    -3.00537: 17,
    -2.72827: 19,
    -2.53830: 20,
    -2.44904: 21,
    -2.32338: 22,
    -2.21069: 23,
    -2.11437: 24,
    -2.03972: 25,
    -1.92173: 26,
    -1.76250: 27,
    -1.63340: 28,
    -1.50796: 29,
    -1.37639: 30,
    -1.23177: 31,
    -1.09207: 32,
    -0.94779: 33,
    -0.80615: 34,
    -0.69509: 35,
    -0.57545: 36,
    -0.43999: 37,
    -0.30033: 38,
    -0.15487: 39,
    0.00332: 40,
    0.16767: 41,
    0.32197: 42,
    0.47617: 43,
    0.63779: 44,
    0.80523: 45,
    0.96248: 46,
    1.11406: 47,
    1.28610: 48,
    1.45421: 49,
    1.58487: 50,
    1.74091: 51,
    1.93886: 52,
    2.12700: 53,
    2.32338: 54,
    2.57309: 55,
    2.85950: 56,
    3.00537: 58,
    3.27393: 59
}

# Create the 'escore_values' column using the mapping
df['escore_values'] = df['Escore'].map(escore_mapping)

In [309]:
# Define the mapping for Oscore values
oscore_mapping = {
    -3.27393: 24,
    -2.85950: 26,
    -2.63199: 28,
    -2.39883: 29,
    -2.21069: 30,
    -2.09015: 31,
    -1.97495: 32,
    -1.82919: 33,
    -1.68062: 34,
    -1.55521: 35,
    -1.42424: 36,
    -1.27553: 37,
    -1.11902: 38,
    -0.97631: 39,
    -0.84732: 40,
    -0.71727: 41,
    -0.58331: 42,
    -0.45174: 43,
    -0.31776: 44,
    -0.17779: 45,
    -0.01928: 46,
    0.14143: 47,
    0.29338: 48,
    0.44585: 49,
    0.58331: 50,
    0.72330: 51,
    0.88309: 52,
    1.06238: 53,
    1.24033: 54,
    1.43533: 55,
    1.65653: 56,
    1.88511: 57,
    1.15324: 58,
    2.44904: 59,
    2.90161: 60
}

# Create the 'oscore_values' column using the mapping
df['oscore_values'] = df['Oscore'].map(oscore_mapping)

In [310]:
# Define the mapping for Ascore values
ascore_mapping = {
    -3.46436: 12,
    -3.15735: 16,
    -3.00537: 18,
    -2.90161: 23,
    -2.78793: 24,
    -2.70172: 25,
    -2.53830: 26,
    -2.35413: 27,
    -2.21844: 28,
    -2.07848: 29,
    -1.92595: 30,
    -1.77200: 31,
    -1.62090: 32,
    -1.47955: 33,
    -1.34289: 34,
    -1.21213: 35,
    -1.07533: 36,
    -0.91699: 37,
    -0.76096: 38,
    -0.60633: 39,
    -0.45321: 40,
    -0.30172: 41,
    -0.15487: 42,
    -0.01729: 43,
    0.13136: 44,
    0.28783: 45,
    0.43852: 46,
    0.59042: 47,
    0.76096: 48,
    0.94156: 49,
    1.11406: 50,
    1.28610: 51,
    1.45039: 52,
    1.61108: 53,
    1.81866: 54,
    2.03972: 55,
    2.23427: 56,
    2.46262: 57,
    2.75696: 58,
    3.15735: 59,
    3.46436: 60
}

# Create the 'ascore_values' column using the mapping
df['ascore_values'] = df['Ascore'].map(ascore_mapping)

In [311]:
# Define the mapping for Cscore values
cscore_mapping = {
    -3.46436: 17,
    -3.15735: 19,
    -2.90161: 20,
    -2.72827: 21,
    -2.57309: 22,
    -2.42317: 23,
    -2.30408: 24,
    -2.18109: 25,
    -2.04506: 26,
    -1.92173: 27,
    -1.78169: 28,
    -1.64101: 29,
    -1.51840: 30,
    -1.38502: 31,
    -1.25773: 32,
    -1.13788: 33,
    -1.01450: 34,
    -0.89891: 35,
    -0.78155: 36,
    -0.65253: 37,
    -0.52745: 38,
    -0.40581: 39,
    -0.27607: 40,
    -0.14277: 41,
    -0.00665: 42,
    0.12331: 43,
    0.25953: 44,
    0.41594: 45,
    0.58489: 46,
    0.75830: 47,
    0.93949: 48,
    1.13407: 49,
    1.30612: 50,
    1.46191: 51,
    1.63088: 52,
    1.81175: 53,
    2.04506: 54,
    2.33337: 55,
    2.63199: 56,
    3.00537: 57,
    3.46436: 59
}


# Create the 'cscore_values' column using the mapping
df['cscore_values'] = df['Cscore'].map(cscore_mapping)

In [312]:
# Define the mapping for impulsiveness values
impulsive_mapping = {
    -2.55524: 20,
    -1.37983: 276,
    -0.71126: 307,
    -0.21712: 355,
    0.19268: 257,
    0.52975: 216,
    0.88113: 195,
    1.29221: 148,
    1.86203: 104,
    2.90161: 7
}

# Create the 'impulsive_values' column using the mapping
df['impulsive_values'] = df['Impulsive'].map(impulsive_mapping)

In [313]:
# Define the mapping for sensation values
sensation_mapping = {
    -2.07848: 71,
    -1.54858: 87,
    -1.18084: 132,
    -0.84637: 169,
    -0.52593: 211,
    -0.21575: 223,
    0.07987: 219,
    0.40148: 249,
    0.76540: 211,
    1.22470: 210,
    1.92173: 103
}

# Create the 'sensation_values' column using the mapping
df['sensation_values'] = df['SS'].map(sensation_mapping)

In [314]:
# Define the mapping for drug use values
drug_use_mapping = {
    'CL0': 'Never Used',
    'CL1': 'Used over a Decade Ago',
    'CL2': 'Used in Last Decade',
    'CL3': 'Used in Last Year',
    'CL4': 'Used in Last Month',
    'CL5': 'Used in Last Week',
    'CL6': 'Used in Last Day'
}

# List of columns that need this mapping
drug_columns = [
    'Alcohol', 'Amphet', 'Amyl', 'Benzos', 'Cannabis', 'Choc', 'Coke', 'Caff',
    'Crack', 'Ecstasy', 'Heroin', 'Ketamine', 'Legalh', 'LSD', 'Meth',
    'Mushrooms', 'Nicotine', 'Semer', 'VSA'
]

# Apply the mapping to each of the drug columns
for col in drug_columns:
    df[f'{col}_values'] = df[col].map(drug_use_mapping)

In [315]:
df.head()

Unnamed: 0,ID,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,...,Ecstasy_values,Heroin_values,Ketamine_values,Legalh_values,LSD_values,Meth_values,Mushrooms_values,Nicotine_values,Semer_values,VSA_values
1442,1446,1.09449,0.48246,1.16365,0.96082,-0.31685,-1.43907,0.16767,0.29338,0.94156,...,Never Used,Never Used,Never Used,Never Used,Never Used,Never Used,Never Used,Used over a Decade Ago,Never Used,Never Used
1091,1095,-0.95197,-0.48246,0.45468,-0.09765,-0.31685,-1.69163,-0.80615,-0.58331,-1.21213,...,Used in Last Month,Never Used,Never Used,Used in Last Month,Used in Last Month,Never Used,Used in Last Year,Never Used,Never Used,Used in Last Month
1342,1346,-0.95197,-0.48246,-1.22751,-0.28519,-0.31685,0.82562,1.11406,1.65653,1.2861,...,Never Used,Never Used,Used in Last Decade,Used in Last Week,Never Used,Never Used,Never Used,Used in Last Week,Never Used,Never Used
794,798,-0.95197,0.48246,0.45468,0.96082,-0.31685,-0.58016,0.80523,0.44585,0.59042,...,Used in Last Year,Never Used,Never Used,Used in Last Decade,Never Used,Never Used,Never Used,Used in Last Week,Never Used,Used in Last Decade
184,185,1.82213,-0.48246,-1.22751,0.96082,-0.31685,0.04257,-0.80615,-0.97631,0.59042,...,Never Used,Never Used,Never Used,Never Used,Never Used,Never Used,Never Used,Used in Last Day,Never Used,Never Used


In [316]:
NUM_MISSING_COLUMNS = 4
NUM_OUTLIER_COLUMNS = 2

NUM_MISSING_VALUES_PER_ROW = 10
NUM_DUPLICATES_PER_ROW = 2
NUM_OUTLIERS_PER_ROW = 10
OUTLIERS_VARIATION = 3

In [317]:
target_col_index = df.columns.get_loc('VSA')
missing_columns = random.choices(df.columns[target_col_index+1:],k=NUM_MISSING_COLUMNS)
missing_columns

['education_values', 'impulsive_values', 'Ecstasy_values', 'Nicotine_values']

In [318]:
outlier_columns = random.choices(['nscore_values', 'escore_values', 'oscore_values', 'ascore_values', 'cscore_values', 'impulsive_values', 'sensation_values'], k=NUM_OUTLIER_COLUMNS)
outlier_columns

['nscore_values', 'cscore_values']

In [319]:
def introduce_missing_values(df, columns, num_missing, random_state=None):
    np.random.seed(random_state)
    df_copy = df.copy()
    
    for col in columns:
        missing_indices = np.random.choice(df_copy.index, num_missing, replace=False)
        if df_copy[col].dtype == 'object':
            df_copy.loc[missing_indices, col] = ''
        else:
            df_copy.loc[missing_indices, col] = np.nan
    
    return df_copy

In [320]:
df = introduce_missing_values(df, columns=missing_columns, num_missing=NUM_MISSING_VALUES_PER_ROW, random_state=RANDOM_STATE)

In [321]:
def introduce_duplicate_rows(df, num_duplicates, random_state=None):
    np.random.seed(random_state)
    df_copy = df.copy()
    
    duplicate_indices = np.random.choice(df_copy.index, num_duplicates)
    duplicate_rows = df_copy.loc[duplicate_indices]
    df_copy = pd.concat([df_copy, duplicate_rows], ignore_index=True)
    
    return df_copy

In [322]:
df = introduce_duplicate_rows(df, num_duplicates=NUM_DUPLICATES_PER_ROW, random_state=RANDOM_STATE)

In [323]:
def introduce_outliers(df, columns, num_outliers, variation=0, random_state=None):
    np.random.seed(random_state)
    df_copy = df.copy()
    
    for col in columns:
        outlier_count = np.random.randint(max(0, num_outliers-variation), num_outliers+variation+1)
        outlier_indices = np.random.choice(df_copy.index, outlier_count, replace=False)
        
        mean = df_copy[col].mean()
        std_dev = df_copy[col].std()
        
        # Introduce outliers as values far from the mean
        outliers = np.random.choice([mean + 3 * std_dev, mean - 3 * std_dev], outlier_count)
        df_copy.loc[outlier_indices, col] = outliers
    
    return df_copy

In [324]:
df = introduce_outliers(df, columns=outlier_columns, num_outliers=NUM_OUTLIERS_PER_ROW, variation=OUTLIERS_VARIATION, random_state=RANDOM_STATE)

  9.81180397 65.99771984  9.81180397  9.81180397  9.81180397 65.99771984]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_copy.loc[outlier_indices, col] = outliers
 65.39346092 23.93987241 65.39346092 23.93987241 23.93987241]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df_copy.loc[outlier_indices, col] = outliers


In [325]:
df

Unnamed: 0,ID,Age,Gender,Education,Country,Ethnicity,Nscore,Escore,Oscore,Ascore,...,Ecstasy_values,Heroin_values,Ketamine_values,Legalh_values,LSD_values,Meth_values,Mushrooms_values,Nicotine_values,Semer_values,VSA_values
0,1446,1.09449,0.48246,1.16365,0.96082,-0.31685,-1.43907,0.16767,0.29338,0.94156,...,,Never Used,Never Used,Never Used,Never Used,Never Used,Never Used,,Never Used,Never Used
1,1095,-0.95197,-0.48246,0.45468,-0.09765,-0.31685,-1.69163,-0.80615,-0.58331,-1.21213,...,,Never Used,Never Used,Used in Last Month,Used in Last Month,Never Used,Used in Last Year,Never Used,Never Used,Used in Last Month
2,1346,-0.95197,-0.48246,-1.22751,-0.28519,-0.31685,0.82562,1.11406,1.65653,1.2861,...,Never Used,Never Used,Used in Last Decade,Used in Last Week,Never Used,Never Used,Never Used,,Never Used,Never Used
3,798,-0.95197,0.48246,0.45468,0.96082,-0.31685,-0.58016,0.80523,0.44585,0.59042,...,Used in Last Year,Never Used,Never Used,Used in Last Decade,Never Used,Never Used,Never Used,,Never Used,Used in Last Decade
4,185,1.82213,-0.48246,-1.22751,0.96082,-0.31685,0.04257,-0.80615,-0.97631,0.59042,...,,Never Used,Never Used,Never Used,Never Used,Never Used,Never Used,Used in Last Day,Never Used,Never Used
5,1428,-0.95197,0.48246,-0.05921,0.96082,0.1144,0.04257,0.00332,-0.17779,0.28783,...,,Never Used,Never Used,Never Used,Never Used,Never Used,Never Used,,Never Used,Used in Last Day
6,689,0.49788,-0.48246,-0.61113,-0.57009,-0.31685,1.02119,1.11406,-0.31776,-1.21213,...,Used over a Decade Ago,Never Used,Never Used,Never Used,Never Used,Never Used,Never Used,,Never Used,Never Used
7,887,-0.07854,-0.48246,-0.61113,-0.28519,-0.31685,1.02119,-2.72827,0.14143,-0.01729,...,Used in Last Decade,Never Used,Never Used,Used in Last Week,Used in Last Decade,Never Used,Used in Last Month,Used in Last Day,Never Used,Never Used
8,1325,-0.95197,0.48246,-0.61113,-0.57009,-0.31685,0.41667,-0.57545,1.06238,0.43852,...,,Never Used,Never Used,Used in Last Year,Never Used,Used in Last Year,Used in Last Month,,Never Used,Never Used
9,1335,-0.07854,-0.48246,-0.61113,-0.28519,-0.31685,-2.52197,0.32197,0.7233,2.46262,...,Never Used,Never Used,Never Used,Used in Last Year,Never Used,Never Used,Used in Last Year,Used in Last Day,Never Used,Never Used


#  TASK 1: Resolve data quality issues using Python and relevant Python packages

##  TASK 1.1: Missing values

##  TASK 1.2: Duplicate values

##  TASK 1.2: Outliers

#  TASK 2: You must specify and answer three questions using appropriate data visualisation techniques

##  TASK 2.1: Q1 - 

##  TASK 2.2: Q2 - 

##  TASK 2.3: Q3 - 