<a href="https://colab.research.google.com/github/PigeonLore/Proj2/blob/main/Proj2_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [451]:
import numpy as np
import pandas as pd
import seaborn as sns
from google.colab import drive
import matplotlib.pyplot as plt
import re

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, ConfusionMatrixDisplay

from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import LogisticRegression

from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [452]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


###### Custom Functions

In [453]:
'''
Running this function will loop through all rows in the specified column
and remove ascii characters (excluding decimal points) from values.

'''

def clean_cols(df, col):
    
    # Loop through columns within the dataframe for the length of rows in the specified column
    for i in range(len(df[col])):

        # Search for and resub values within the cell respective to the index
        num = re.sub(r'[^0-9\.]+','', df[col][i])  
        
        # Assigns new value to the cell of the respective index 
        df[col][i] = float(num)
        
    return df

In [454]:
'''
Running this function will loop through all rows in the specified column and returns
the values that contains an ascii character (excluding decimal points)

'''

def search_cols(df, col):
    
    # Loop through columns within the dataframe for the length of rows in the specified column
    for i in range(len(df[col])):

        # Search for values within the cell respective to the index
        num = re.search(r'[^0-9\.]+','', df[col][i])  
        
    return df

In [455]:
from IPython.core.debugger import coloransi
'''
This function loops through the dataframe columns and prints the 
data type, column name, and unique elements within the column.

There are three kwargs; 

col allows the user to specify the column of interest,

show_count will toggle the value_counts function, 

sort_values sorts the data in ascending order.

'''

def LU(df, col=None, show_count=False, sort_values=False):

    if col != None:
        unique_vals = df[col].unique()
        if sort_values:
            unique_vals = sorted(unique_vals) 

        print(df[col].dtype,
              '\n',
              col, 
              unique_vals,
              df[col].value_counts() if show_count else "",
              '\n'
              )
    else:
        for i in df.columns:
            unique_vals = df[i].unique()
            if sort_values:
                unique_vals = sorted(unique_vals)

            print(df[i].dtype,
                  '\n',
                  col,
                  unique_vals,
                  df[i].value_counts() if show_count else "",
                  '\n'
                  )

# Adult Earnings (Classification)

In [456]:
df1 = pd.read_csv('/content/drive/MyDrive/Data/adult.csv')

In [457]:
df1.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [458]:
row, col = df1.shape

print(f'Number of rows: {row}\nNumber of columns: {col}')

Number of rows: 48842
Number of columns: 15


In [459]:
# Display missing values:

df1.isna().any().sum()

0

In [460]:
# Display duplicated values:

df1.duplicated().sum()

52

In [461]:
# Display count of rows and columns

row, col = df1.shape

print(f'Number of rows: {row}\nNumber of columns: {col}')

Number of rows: 48842
Number of columns: 15


In [462]:
# Assess the dtype, unique values, and ammount of unique values within each column

LU(df1)

int64 
 None [25 38 28 44 18 34 29 63 24 55 65 36 26 58 48 43 20 37 40 72 45 22 23 54
 32 46 56 17 39 52 21 42 33 30 47 41 19 69 50 31 59 49 51 27 57 61 64 79
 73 53 77 80 62 35 68 66 75 60 67 71 70 90 81 74 78 82 83 85 76 84 89 88
 87 86]  

object 
 None ['Private' 'Local-gov' '?' 'Self-emp-not-inc' 'Federal-gov' 'State-gov'
 'Self-emp-inc' 'Without-pay' 'Never-worked']  

int64 
 None [226802  89814 336951 ... 129912 255835 257302]  

object 
 None ['11th' 'HS-grad' 'Assoc-acdm' 'Some-college' '10th' 'Prof-school'
 '7th-8th' 'Bachelors' 'Masters' 'Doctorate' '5th-6th' 'Assoc-voc' '9th'
 '12th' '1st-4th' 'Preschool']  

int64 
 None [ 7  9 12 10  6 15  4 13 14 16  3 11  5  8  2  1]  

object 
 None ['Never-married' 'Married-civ-spouse' 'Widowed' 'Divorced' 'Separated'
 'Married-spouse-absent' 'Married-AF-spouse']  

object 
 None ['Machine-op-inspct' 'Farming-fishing' 'Protective-serv' '?'
 'Other-service' 'Prof-specialty' 'Craft-repair' 'Adm-clerical'
 'Exec-managerial' 'Tech-suppor

## Q & A's

1. Source of data


Source: https://www.kaggle.com/datasets/wenruliu/adult-income-dataset


2. Brief description of data &  3. What is the target?


This data set describes the income of adults with the target being income.



4. What does one row represent? (A person?  A business?  An event? A product?)

One row represents an individual's government collected personal data.

5. Is this a classification or regression problem?


This is a binary classification problem as the intention is to classify if the income of the adult is above or below $50k.


6. How many features does the data have?


This data set will be trained using 10 features


7. How many rows are in the dataset?


There are 48842 rows


8. What, if any, challenges do you foresee in cleaning, exploring, or modeling this dataset?

The foreseeable challenges would be modeling

# Wine Price (Regression)

In [463]:
df2 = pd.read_csv('/content/drive/MyDrive/Data/wines_SPA.csv')

In [464]:
# Display head to visualize dataset

df2.head()

Unnamed: 0,winery,wine,year,rating,num_reviews,country,region,price,type,body,acidity
0,Teso La Monja,Tinto,2013,4.9,58,Espana,Toro,995.0,Toro Red,5.0,3.0
1,Artadi,Vina El Pison,2018,4.9,31,Espana,Vino de Espana,313.5,Tempranillo,4.0,2.0
2,Vega Sicilia,Unico,2009,4.8,1793,Espana,Ribera del Duero,324.95,Ribera Del Duero Red,5.0,3.0
3,Vega Sicilia,Unico,1999,4.8,1705,Espana,Ribera del Duero,692.96,Ribera Del Duero Red,5.0,3.0
4,Vega Sicilia,Unico,1996,4.8,1309,Espana,Ribera del Duero,778.06,Ribera Del Duero Red,5.0,3.0


In [465]:
# Display total missing data

df2.isna().any().sum()

4

In [466]:
# Display total duplicated data

df2.duplicated().any().sum()

1

In [467]:
# Display count of rows and columns

row, col = df2.shape

print(f'Number of rows: {row}\nNumber of columns: {col}')

Number of rows: 7500
Number of columns: 11


In [468]:
# Assess the dtype, unique values, and ammount of unique values within each column


LU(df2, 'wine')

object 
 wine ['Tinto' 'Vina El Pison' 'Unico' 'Unico Reserva Especial Edicion'
 'El Anejon' 'Don PX Convento Seleccion' 'Cuesta de Las Liebres' 'El Nido'
 'Toneles Moscatel' 'Pingus' 'Don PX Pedro Ximenez'
 "L'Ermita Velles Vinyes Priorat" 'Vatan Arena Tinta de Toro'
 'Ribera Del Duero Gran Reserva 12 Anos' 'Pesus Ribera del Duero' 'Magico'
 'La Faraona Bierzo (Corullon)' 'Gran Reserva 890' 'Valbuena 5o'
 'Castillo Ygay Gran Reserva Especial Blanco' 'La Nieta'
 'Malleolus de Valderramiro' 'Malleolus de Sanchomartin' 'Alabaster'
 'La Mula de la Quietud' 'Terreus Paraje de Cueva Baja' 'Contador Rioja'
 'Maria Remirez de Ganuza' 'Cartago Paraje de Pozo'
 'Parcela El Picon Tinto' 'Termanthia' 'Clon De La Familia'
 'Aquilon Garnacha' 'Quinon de Valmira' '1902 Centenary Carignan Priorat'
 'Tintilla de Rota' 'Cirsion Rioja' 'Cami Pesseroles' "Turo d'en Mota"
 'Priorat' 'Reliquia Palo Cortado Sherry' 'Anada Palo Cortado 1987'
 'Daphne Glorian Red' 'El Regollar' 'Abuelo Diego Palo Cortado'
 'L

## Q & A's

1. Source of data

Source: https://www.kaggle.com/datasets/sagnik1511/car-insurance-data


2. Brief description of data.  


This data set describes the annual car insurance data of an insurance company. 


3. What is the target?


The target is price of wine given featured data.


4. What does one row represent? (A person?  A business?  An event? A product?)

One row represents a bottle of wine.

5. Is this a classification or regression problem?


This is a regression problem as the goal is to determine the continuous value, price.


6. How many features does the data have?


This data set will be trained using 8 features.


7. How many rows are in the dataset?


There are 7500 rows.


8. What, if any, challenges do you foresee in cleaning, exploring, or modeling this dataset?

There are a myriad of catagorical columns containing highly unique data elements such as winery location, types of wine, body of wine, etc.
The immediate foreseeable challenge will be determining the type of statistical variable to assign each column.