In [7]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, roc_curve, roc_auc_score, ConfusionMatrixDisplay

In [46]:
data = pd.read_csv('50_Startups.csv')
data.dtype

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

### Soal Tipe Data

In [83]:
# Data

company_name_list = [{'name': 'Company 1'},
          {'name': 'Company 2'},
          {'name': 'Company 3'}]

employee_name_list = [{'name': 'John Doe'},
          {'name': 'Tom Smith'},
          {'name': 'Andrew Sebastian'}]

company_detail_list = {
      'Company 1': {
          'name': 'Company 1',
          'domain': 'Retail',
          'clients': [
              {
                  'name': 'acme.inc',
                  'country': 'united states'
              },
              {
                  'name': 'Wayne.co',
                  'country': 'united states'
              }
          ]
      },
      'Company 2': {
          'name': 'Company 2',
          'domain': 'Construction',
          'clients': [
              {
                  'name': 'Tesla',
                  'country': 'united states'
              },
              {
                  'name': 'Japan Airlines',
                  'country': 'japan'
              },
              {
                  'name': 'Indofood',
                  'country': 'indonesia'
              }
          ]
      },
      'Company 3': {
          'name': 'Company 3',
          'domain': 'Healthcare',
          'clients': [
              {
                  'name': 'Petronas',
                  'country': 'malaysia'
              },
              {
                  'name': 'VW Group',
                  'country': 'germany'
              },
              {
                  'name': 'IBM',
                  'country': 'united states'
              },
              {
                  'name': 'Mitsubishi',
                  'country': 'japan'
              }
          ]
      }
  }

employee_detail_list = {
      'John Doe': {
          'name': 'EMP-0001',
          'first_name': 'John',
          'last_name': 'Doe',
          'full_name': 'John Doe',
          'company': 'Company 1'
      },
      'Tom Smith': {
          'name': 'EMP-0002',
          'first_name': 'Tom',
          'last_name': 'Smith',
          'full_name': 'Tom Smith',
          'company': 'Company 2'
      },
      'Andrew Sebastian': {
          'name': 'EMP-0003',
          'first_name': 'Andrew',
          'last_name': 'Sebastian',
          'full_name': 'Andrew Sebastian',
          'company': 'Company 2'
      },
  }

### 1

In [84]:
sorted_companies = sorted(
    [{'name': company['name'], 'domain': company_detail_list[company['name']]['domain']}
     for company in company_name_list],
    key=lambda x: x['domain'],
    reverse=True
)

print(sorted_companies)


[{'name': 'Company 1', 'domain': 'Retail'}, {'name': 'Company 3', 'domain': 'Healthcare'}, {'name': 'Company 2', 'domain': 'Construction'}]


### 2

In [85]:
for company in company_name_list:
    company_name = company['name']
    domain = company_detail_list[company_name]['domain']
    num_clients = len(company_detail_list[company_name]['clients'])
    print(f"{company_name}: {domain}, relation: {num_clients} clients")


Company 1: Retail, relation: 2 clients
Company 2: Construction, relation: 3 clients
Company 3: Healthcare, relation: 4 clients


### 3

In [86]:
def get_employee_company_domain():
    return [{'full_name': employee_detail['full_name'],
             'company': employee_detail['company'],
             'domain': company_detail_list[employee_detail['company']]['domain']}
            for employee_name, employee_detail in employee_detail_list.items()]

print(get_employee_company_domain())


[{'full_name': 'John Doe', 'company': 'Company 1', 'domain': 'Retail'}, {'full_name': 'Tom Smith', 'company': 'Company 2', 'domain': 'Construction'}, {'full_name': 'Andrew Sebastian', 'company': 'Company 2', 'domain': 'Construction'}]


### 4

In [87]:
def get_companies_with_employees():
    return [{'company': company['name'],
             'employees': [employee_detail['full_name'] for employee_detail in employee_detail_list.values() if employee_detail['company'] == company['name']]}
            for company in company_name_list]

print(get_companies_with_employees())


[{'company': 'Company 1', 'employees': ['John Doe']}, {'company': 'Company 2', 'employees': ['Tom Smith', 'Andrew Sebastian']}, {'company': 'Company 3', 'employees': []}]


### Soal pre-processing data

In [90]:
data_startup = pd.read_csv('50_Startups.csv')
data_startup.sample(7)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
27,72107.6,127864.55,353183.81,New York,105008.31
41,27892.92,84710.77,164470.71,Florida,77798.83
39,38558.51,82982.09,174999.3,California,81005.76
12,93863.75,127320.38,249839.44,Florida,141585.52
31,,152701.92,88218.23,New York,
17,94657.16,145077.58,282574.31,New York,125370.37
21,78389.47,153773.43,299737.29,New York,111313.02


### 1

In [110]:
# Membaca dataset
df = pd.read_csv("50_Startups.csv")

# Menampilkan field mana saja yang memiliki data kosong
fields_with_missing_data = df.columns[df.isnull().any()]
print("Fields dengan data kosong:", fields_with_missing_data)

# Mengisi data kosong dengan nilai mean
for field in fields_with_missing_data:
    mean_value = df[field].mean()
    df[field].fillna(mean_value, inplace=True)

# Menyimpan hasil perubahan
df.to_csv("50_Startups.csv", index=False)

df.sample(5)


Fields dengan data kosong: Index([], dtype='object')


Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State
3,1.584537,-0.161817,1.478634,1.78867,New York
40,-1.126199,-0.16676,-0.4704726,-0.846135,California
10,-1.444366e-16,-0.479207,0.05167498,0.862754,Florida
2,1.797168,-0.85046,1.707769,1.993801,Florida
19,0.2258009,1.207206,1.598721e-16,0.275056,New York


### 2

In [109]:
# Membaca dataset
df = pd.read_csv("50_Startups.csv")

# Melakukan OneHotEncoder ke field State
df = pd.get_dummies(df, columns=['State'])

# Menyimpan hasil perubahan
df.to_csv("50_Startups_onehot.csv", index=False)

df.sample(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
33,-0.499272,-0.7753361,-0.08288993,-0.379427,False,True,False
7,1.254557,6.655787e-16,0.9290873,1.1052,False,True,False
37,-0.767115,6.655787e-16,-0.2459799,-0.551363,True,False,False
21,0.037527,1.217395,0.7054687,-0.01354,False,False,True
48,-1.787653,-2.791579,1.598721e-16,-1.917721,False,False,True


### 3

In [107]:
# Membaca dataset
df = pd.read_csv("50_Startups.csv")

# Menghitung nilai Tax
df['Tax'] = (df['Profit'] + df['Marketing Spend'] + df['Administration']) * 0.05

# Menyimpan hasil perubahan
df.to_csv("50_Startups_tax.csv", index=False)

df.sample(5)

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State,Tax
8,1.025831,1.018794,0.8154837,1.016062,New York,0.142517
10,-1.700419e-16,-0.4792072,0.05167498,0.862754,Florida,0.021761
23,-1.700419e-16,-0.6695015,1.865175e-16,-0.078465,Florida,-0.037398
20,-0.01254345,-0.350597,0.6955305,0.166734,California,0.025583
24,0.0059821,6.727952e-16,-0.768955,-0.083046,New York,-0.0426


### 4

In [108]:
from sklearn.preprocessing import StandardScaler

# Membaca dataset
df = pd.read_csv("50_Startups.csv")

# Menghapus kolom non-numerik (State)
df_numerical = df.drop(columns=['State'])

# Inisialisasi StandardScaler
scaler = StandardScaler()

# Melakukan scaling ke field-field numerik
scaled_data = scaler.fit_transform(df_numerical)

# Membuat DataFrame baru dari data yang sudah di-scale
df_scaled = pd.DataFrame(scaled_data, columns=df_numerical.columns)

# Menambahkan kolom State yang telah dihapus sebelumnya
df_scaled['State'] = df['State']

# Menyimpan hasil perubahan
df_scaled.to_csv("50_Startups.csv", index=False)

df.sample(5)


Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State
23,-1.700419e-16,-0.669502,1.865175e-16,-0.078465,Florida
26,-0.0342304,0.83872,-0.8293979,-0.154,Florida
8,1.025831,1.018794,0.8154837,1.016062,New York
9,1.0913,-0.554449,0.7540503,0.954339,California
35,-0.7215349,-1.482987,-0.1673481,-0.386964,New York
