### 개인 과제 - ML challenge
#### - 성능보다는 전체 구현에 큰 목적을 두고 해보기

아래 3개의 ml 대회 중 하나를 구현해보세요

- 간단한 EDA
- 데이터 전처리
- 모델 구축
- 성능 평가

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style='darkgrid')

# 1 - Stroke Prediction

## Task
classification

## dataset

healthcare-dataset-stroke-data

## Context

According to the World Health Organization (WHO) stroke is the 2nd leading cause of death globally, responsible for approximately 11% of total deaths.
This dataset is used to predict whether a patient is likely to get stroke based on the input parameters like gender, age, various diseases, and smoking status. Each row in the data provides relavant information about the patient.

## Attribute Information

1. id: unique identifier
2. gender: "Male", "Female" or "Other"
3. age: age of the patient
4. hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
5. heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
6. ever_married: "No" or "Yes"
7. work_type: "children", "Govt_jov", "Never_worked", "Private" or "Self-employed"
8. Residence_type: "Rural" or "Urban"
9. avg_glucose_level: average glucose level in blood
10. bmi: body mass index
11. smoking_status: "formerly smoked", "never smoked", "smokes" or "Unknown"*

### target
- stroke: 1 if the patient had a stroke or 0 if not

*Note: "Unknown" in smoking_status means that the information is unavailable for this patient

In [4]:
stroke_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/03_머신러닝/개인 과제 Ml Challenge/healthcare-dataset-stroke-data.csv')
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


- EDA

In [5]:
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [6]:
# 결측치가 있는 행 제거
stroke_df.dropna(axis=0, inplace=True)

In [7]:
stroke_df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0,4909.0
mean,37064.313506,42.865374,0.091872,0.049501,105.30515,28.893237,0.042575
std,20995.098457,22.555115,0.288875,0.216934,44.424341,7.854067,0.201917
min,77.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,18605.0,25.0,0.0,0.0,77.07,23.5,0.0
50%,37608.0,44.0,0.0,0.0,91.68,28.1,0.0
75%,55220.0,60.0,0.0,0.0,113.57,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


- train / test 분리
: test 스케일링 하지않는 것 주의!

In [8]:
X = stroke_df.drop(['stroke'],axis=1)
y = stroke_df[['stroke']]

- 데이터 전처리

In [9]:
# 데이터 스케일링(수치형 데이터 - age, avg_glucose_level, bmi)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

X_tmp = X[['age', 'avg_glucose_level', 'bmi']]
X_scaled = scaler.fit_transform(X_tmp)
X[['age', 'avg_glucose_level', 'bmi']] = X_scaled

In [10]:
X.isna().sum()

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
dtype: int64

In [11]:
# 인코딩 (번주형 데이터 - gender, ever_married, work_type, Residence_type, smoking_status)
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False)

# fit_transform은 train에만 사용하고 test에는 학습된 인코더에 fit만 해야한다
cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
train_cols = ohe.fit_transform(X[cols])
train_cols



array([[0., 1., 0., ..., 1., 0., 0.],
       [0., 1., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       ...,
       [1., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [12]:
ohe.categories_

[array(['Female', 'Male', 'Other'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['Govt_job', 'Never_worked', 'Private', 'Self-employed', 'children'],
       dtype=object),
 array(['Rural', 'Urban'], dtype=object),
 array(['Unknown', 'formerly smoked', 'never smoked', 'smokes'],
       dtype=object)]

In [13]:
cols = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
X = pd.get_dummies(X, columns=cols)

In [14]:
# 데이터 수가 증가함 -> 결측값 존재
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4909 entries, 0 to 5109
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              4909 non-null   int64  
 1   age                             4909 non-null   float64
 2   hypertension                    4909 non-null   int64  
 3   heart_disease                   4909 non-null   int64  
 4   avg_glucose_level               4909 non-null   float64
 5   bmi                             4909 non-null   float64
 6   gender_Female                   4909 non-null   uint8  
 7   gender_Male                     4909 non-null   uint8  
 8   gender_Other                    4909 non-null   uint8  
 9   ever_married_No                 4909 non-null   uint8  
 10  ever_married_Yes                4909 non-null   uint8  
 11  work_type_Govt_job              4909 non-null   uint8  
 12  work_type_Never_worked          49

In [15]:
X

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,9046,1.070138,0,1,2.777698,0.981345,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
2,31112,1.646563,0,1,0.013842,0.459269,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
3,60182,0.272012,0,0,1.484132,0.701207,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
4,1665,1.602222,1,0,1.549193,-0.623083,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5,56669,1.690903,0,0,1.821368,0.013595,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,14180,-1.324241,0,0,-0.050094,-1.310695,1,0,0,1,...,0,0,0,1,1,0,1,0,0,0
5106,44873,1.690903,0,0,0.447882,1.414286,1,0,0,0,...,0,0,1,0,0,1,0,0,1,0
5107,19723,-0.348753,0,0,-0.502369,0.217332,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5108,37544,0.360692,0,0,1.372920,-0.419346,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0


In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=192)

#### - 모델 구축

In [23]:
# 평가 지표
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, roc_auc_score

def evaluation_metrics(y_test, pred):
    print("confusion matrix : \n",confusion_matrix(y_test, pred))
    print("accuracy : ",accuracy_score(y_test, pred))
    print("precision : ",precision_score(y_test, pred, average = 'macro'))
    print("recall : ",recall_score(y_test, pred, average = 'macro'))
    print("f1_score : ",f1_score(y_test, pred))
    print("roc_auc_score : ",roc_auc_score(y_test, pred))

- 로지스틱 회귀

In [24]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()

log_reg.fit(X_train, y_train)
pred = log_reg.predict(X_test)
evaluation_metrics(y_test, pred)

confusion matrix : 
 [[1414    0]
 [  59    0]]
accuracy :  0.9599456890699253
precision :  0.47997284453496264
recall :  0.5
f1_score :  0.0
roc_auc_score :  0.5


  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, msg_start, len(result))


- SVM

In [39]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

params = {
    'C' : [0.01, 0.1, 1, 10, 100, 1000]
}

svc = LinearSVC()
grid_cls = GridSearchCV(svc, param_grid = params, cv = 4, refit = True, scoring = 'f1_macro')
grid_cls.fit(X_train, y_train)
pred = grid_cls.predict(X_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [40]:
evaluation_metrics(y_test, pred)

confusion matrix : 
 [[1414    0]
 [  59    0]]
accuracy :  0.9599456890699253
precision :  0.47997284453496264
recall :  0.5
f1_score :  0.0
roc_auc_score :  0.5


  _warn_prf(average, modifier, msg_start, len(result))


In [37]:
grid_cls.best_params_

{'C': 0.1}

In [38]:
grid_cls.best_score_

0.4482592598341071

- lightGBM

In [28]:
from hyperopt import hp
lgbm_search_space = {'num_leaves': hp.quniform('num_leaves', 32, 64, 1),
                     'max_depth': hp.quniform('max_depth', 100, 160, 1),
                     'min_child_samples': hp.quniform('min_child_samples', 60, 100, 1),
                     'subsample': hp.uniform('subsample', 0.7, 1),
                     'learning_rate': hp.uniform('learning_rate', 0.01, 0.2)
                    }

In [32]:
from lightgbm import LGBMClassifier

lgbm_clf =  LGBMClassifier(n_estimators=100, num_leaves=lgbm_search_space['num_leaves'],
                           max_depth=lgbm_search_space['max_depth'],
                           min_child_samples=lgbm_search_space['min_child_samples'], 
                           subsample=lgbm_search_space['subsample'],
                           learning_rate=lgbm_search_space['learning_rate'])

lgb = LGBMClassifier()
lgb.fit(X_train,y_train)
pred = lgb.predict(X_test)
evaluation_metrics(y_test,pred)

confusion matrix : 
 [[1406    8]
 [  58    1]]
accuracy :  0.955193482688391
precision :  0.5357468123861566
recall :  0.5056457219571836
f1_score :  0.02941176470588235
roc_auc_score :  0.5056457219571836


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


## f1-score가 거의 0에 가까움 : 데이터 불균형 고려

In [42]:
print(stroke_df['stroke'].value_counts())
unsatisfied_cnt = stroke_df[stroke_df['stroke'] == 1].stroke.count()
total_cnt = stroke_df.stroke.count()
print('{0:.2f}'.format((unsatisfied_cnt / total_cnt)))

0    4700
1     209
Name: stroke, dtype: int64
0.04


In [47]:
# SMOTE 기법 적용
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=0)
X_train_over, y_train_over = smote.fit_resample(X_train, y_train)

print('오버샘플링 전',X_train.shape, y_train.shape)
print('-'*30)
print('오버샘플링 후',X_train_over.shape, y_train_over.shape)
print('-'*30)
print('SMOTE 적용 후 레이블 값 분포: \n', pd.DataFrame(y_train_over).value_counts())

오버샘플링 전 (3436, 22) (3436, 1)
------------------------------
오버샘플링 후 (6572, 22) (6572, 1)
------------------------------
SMOTE 적용 후 레이블 값 분포: 
 stroke
0         3286
1         3286
dtype: int64


In [48]:
# 로지스틱 회귀
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression()

log_reg.fit(X_train_over, y_train_over)
pred = log_reg.predict(X_test)
evaluation_metrics(y_test, pred)

confusion matrix : 
 [[1110  304]
 [  22   37]]
accuracy :  0.778682959945689
precision :  0.5445348849258572
recall :  0.7060628581017908
f1_score :  0.185
roc_auc_score :  0.7060628581017909


  y = column_or_1d(y, warn=True)


In [49]:
# SVM
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV

params = {
    'C' : [0.01, 0.1, 1, 10, 100, 1000]
}

svc = LinearSVC()
grid_cls = GridSearchCV(svc, param_grid = params, cv = 4, refit = True, scoring = 'f1_macro')
grid_cls.fit(X_train_over, y_train_over)
pred = grid_cls.predict(X_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [50]:
evaluation_metrics(y_test, pred) # 뭔가 문제가 있는 것 같습니다...

confusion matrix : 
 [[ 312 1102]
 [  10   49]]
accuracy :  0.24507807196198234
precision :  0.5057578880908311
recall :  0.525579555534246
f1_score :  0.08099173553719008
roc_auc_score :  0.525579555534246


In [51]:
#lightGBM
lgb.fit(X_train_over, y_train_over)
pred = lgb.predict(X_test)
evaluation_metrics(y_test,pred)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


confusion matrix : 
 [[1397   17]
 [  57    2]]
accuracy :  0.9497623896809233
precision :  0.5330304785347137
recall :  0.5109378371251169
f1_score :  0.05128205128205128
roc_auc_score :  0.5109378371251169


In [None]:
# 상당히 큰일난 것 같습니다!

# 2 - League of Legends Diamond Ranked Games

## Task
classification

## dataset
high_diamond_ranked_10min

## Context
League of Legends is a MOBA (multiplayer online battle arena) where 2 teams (blue and red) face off. There are 3 lanes, a jungle, and 5 roles. The goal is to take down the enemy Nexus to win the game.

# Content
This dataset contains the first 10min. stats of approx. 10k ranked games (SOLO QUEUE) from a high ELO (DIAMOND I to MASTER). Players have roughly the same level.

Each game is unique. The gameId can help you to fetch more attributes from the Riot API.

There are 19 features per team (38 in total) collected after 10min in-game. This includes kills, deaths, gold, experience, level… It's up to you to do some feature engineering to get more insights.

The column blueWins is the target value (the value we are trying to predict). A value of 1 means the blue team has won. 0 otherwise.

So far I know, there is no missing value.

## Glossary

1. Warding totem: An item that a player can put on the map to reveal the nearby area. Very useful for map/objectives control.
1. Minions: NPC that belong to both teams. They give gold when killed by players.
1. Jungle minions: NPC that belong to NO TEAM. They give gold and buffs when killed by players.
1. Elite monsters: Monsters with high hp/damage that give a massive bonus (gold/XP/stats) when killed by a team.
1. Dragons: Elite monster which gives team bonus when killed. The 4th dragon killed by a team gives a massive stats bonus. The 5th dragon (Elder Dragon) offers a huge advantage to the team.
1. Herald: Elite monster which gives stats bonus when killed by the player. It helps to push a lane and destroys structures.
1. Towers: Structures you have to destroy to reach the enemy Nexus. They give gold.
1. Level: Champion level. Start at 1. Max is 18.

## columns

- gameId : Unique RIOT ID of the game. Can be used with the Riot Games API.

- blueWins : The target column. 1 if the blue team has won, 0 otherwise.

- blueWardsPlaced : Number of warding totems placed by the blue team on the map

- blueWardsDestroyed : Number of enemy warding totems the blue team has destroyed

- blueFirstBlood : First kill of the game. 1 if the blue team did the first kill, 0 otherwise

- blueKills : Number of enemies killed by the blue team

- blueDeaths : Number of deaths (blue team)

- blueAssists : Number of kill assists (blue team)

- blueEliteMonsters : Number of elite monsters killed by the blue team (Dragons and Heralds)

- blueDragons : Number of dragons killed by the blue team

- blueHeralds : Number of heralds killed by the blue team

- blueTowersDestroyed : Number of structures destroyed by the blue team (towers...)

- blueTotalGold : Blue team total gold

- blueAvgLevel : Blue team average champion level

- blueTotalExperience : Blue team total experience

- blueTotalMinionsKilled : Blue team total minions killed (CS)

- blueTotalJungleMinionsKilled : Blue team total jungle monsters killed

- blueGoldDiff : Blue team gold difference compared to the enemy team

- blueExperienceDiff : Blue team experience difference compared to the enemy team

- blueCSPerMin : Blue team CS (minions) per minute

- blueGoldPerMin : Blue team gold per minute


### target
- win : red_win(0) or blue_win(1)



# 3 - Used Cars Price Prediction

## Task
Regression

## dataset
Used_Cars_Price_Prediction

## columns
1. index

1. Name :
The brand and model of the car.

1. Location :
The location in which the car is being sold or is available for purchase.

1. Year :
The year or edition of the model.

1. Kilometers_Driven :
The total kilometres driven in the car by the previous owner(s) in KM.

1. Fuel_Type :
The type of fuel used by the car. (Petrol / Diesel / Electric / CNG / LPG)

1. Transmission :
The type of transmission used by the car. (Automatic / Manual)

1. Owner_Type :
Whether the ownership is Firsthand, Second hand or other.

1. Mileage :
The standard mileage offered by the car company in kmpl or km/kg

1. Engine :
The displacement volume of the engine in cc.

1. Power :
The maximum power of the engine in bhp.

1. Seats :
The number of seats in the car.

1. New_Price :
The price of a new car of the same model.

## target 
- Price :
The price of the used car in INR Lakhs.

In [None]:
car_df = pd.read_csv('Used_Cars_Price_Prediction.csv')
car_df.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [None]:
car_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         6019 non-null   int64  
 1   Name               6019 non-null   object 
 2   Location           6019 non-null   object 
 3   Year               6019 non-null   int64  
 4   Kilometers_Driven  6019 non-null   int64  
 5   Fuel_Type          6019 non-null   object 
 6   Transmission       6019 non-null   object 
 7   Owner_Type         6019 non-null   object 
 8   Mileage            6017 non-null   object 
 9   Engine             5983 non-null   object 
 10  Power              5983 non-null   object 
 11  Seats              5977 non-null   float64
 12  New_Price          824 non-null    object 
 13  Price              6019 non-null   float64
dtypes: float64(2), int64(3), object(9)
memory usage: 658.5+ KB


#### - EDA

- Name

In [None]:
# 회사 이름만 추출하여 저장
# car_name = [i.split(' ')[0] for i in car_df['Name'].values]

car_name = car_df.Name.str.split(' ').str.get(0)
car_df['Name'] = car_name
car_df.rename(columns={'Name' : 'Company'}, inplace=True)

In [None]:
car_df['Company'].nunique()

31

In [None]:
car_df

Unnamed: 0.1,Unnamed: 0,Company,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.50
2,2,Honda,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.50
3,3,Maruti,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.00
4,4,Audi,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,6014,Maruti,Delhi,2014,27365,Diesel,Manual,First,28.4 kmpl,1248 CC,74 bhp,5.0,7.88 Lakh,4.75
6015,6015,Hyundai,Jaipur,2015,100000,Diesel,Manual,First,24.4 kmpl,1120 CC,71 bhp,5.0,,4.00
6016,6016,Mahindra,Jaipur,2012,55000,Diesel,Manual,Second,14.0 kmpl,2498 CC,112 bhp,8.0,,2.90
6017,6017,Maruti,Kolkata,2013,46000,Petrol,Manual,First,18.9 kmpl,998 CC,67.1 bhp,5.0,,2.65


- Location

In [None]:
car_df['Location'].nunique()

11

- Mileage

In [None]:
car_df['Mileage'].str.endswith('kmpl')

0       False
1        True
2        True
3        True
4        True
        ...  
6014     True
6015     True
6016     True
6017     True
6018     True
Name: Mileage, Length: 6019, dtype: object

In [None]:
# kmpl로 표기되어 있는 값의 개수
car_df['Mileage'].str.endswith('kmpl').sum()

5951

In [None]:
# km/kg를 kmpl로 변환 : kmpl = 1.4 * km/kg


In [None]:
# 1 Lakh : 100,000 루피 / 1 루피 : 0.012 $ --> 1 Lakh : 1200$
car_df[~car_df['New_Price'].isna()]

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.50
7,7,Toyota Innova Crysta 2.8 GX AT 8S,Mumbai,2016,36000,Diesel,Automatic,First,11.36 kmpl,2755 CC,171.5 bhp,8.0,21 Lakh,17.50
10,10,Maruti Ciaz Zeta,Kochi,2018,25692,Petrol,Manual,First,21.56 kmpl,1462 CC,103.25 bhp,5.0,10.65 Lakh,9.95
15,15,Mitsubishi Pajero Sport 4X4,Delhi,2014,110000,Diesel,Manual,First,13.5 kmpl,2477 CC,175.56 bhp,7.0,32.01 Lakh,15.00
20,20,BMW 3 Series 320d,Kochi,2014,32982,Diesel,Automatic,First,22.69 kmpl,1995 CC,190 bhp,5.0,47.87 Lakh,18.55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5999,5999,Tata Bolt Revotron XT,Chennai,2016,10000,Petrol,Manual,First,17.57 kmpl,1193 CC,88.7 bhp,5.0,7.77 Lakh,4.00
6002,6002,Volkswagen Vento 1.6 Highline,Mumbai,2011,38000,Petrol,Manual,First,16.09 kmpl,1598 CC,103.5 bhp,5.0,11.91 Lakh,3.25
6005,6005,Maruti Vitara Brezza VDi,Pune,2016,37208,Diesel,Manual,First,24.3 kmpl,1248 CC,88.5 bhp,5.0,9.93 Lakh,7.43
6010,6010,Honda Brio 1.2 VX MT,Delhi,2013,33746,Petrol,Manual,First,18.5 kmpl,1198 CC,86.8 bhp,5.0,6.63 Lakh,3.20
