In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Korean font setup (only needed when figures contain Hangul text)
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,  # keep the minus sign renderable with this font
})


In [4]:
# Path to the Excel workbook
file_path = "/Users/stella/Documents/03.proj/p_truck_delivery/data/winter_internship_prj_250122_v1.0.xlsx"  # Replace with the actual file name.

# Open the workbook once (engine='openpyxl' stated explicitly). The context
# manager closes the underlying file handle when the block exits, and the
# sheet is parsed from the already-open workbook instead of re-reading the file.
with pd.ExcelFile(file_path, engine='openpyxl') as excel_data:
    # List every sheet name (optional sanity check)
    print("Available sheets:", excel_data.sheet_names)

    # Read the second sheet (sheet indices are 0-based, so 1 is the second sheet)
    df = excel_data.parse(sheet_name=1)

# Inspect the DataFrame
print(df)

Available sheets: ['info', 'data_sample']
        GpsProvider           BookingID Market/Regular   \
0     CONSENT TRACK  MVCV0000927/082021          Market   
1           VAMOSYS  VCV00014271/082021         Regular   
2     CONSENT TRACK  VCV00014382/082021         Regular   
3           VAMOSYS  VCV00014743/082021         Regular   
4           VAMOSYS  VCV00014744/082021         Regular   
...             ...                 ...             ...   
6875          JTECH        WDSBKTP42751         Regular   
6876          JTECH        WDSBKTP43203         Regular   
6877          JTECH        WDSBKTP43021         Regular   
6878          JTECH        WDSBKTP42685         Regular   
6879          JTECH        WDSBKTP42858         Regular   

              BookingID_Date  vehicle_no  \
0    2020-08-17 14:59:01.000    KA590408   
1    2020-08-27 16:22:22.827  TN30BC5917   
2    2020-08-27 17:59:24.987  TN22AR2748   
3    2020-08-28 00:48:24.503  TN28AQ0781   
4    2020-08-28 01:23:19.243 

In [5]:
# Summarise column dtypes and non-null counts of the loaded sheet.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6880 entries, 0 to 6879
Data columns (total 32 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   GpsProvider                         5927 non-null   object        
 1   BookingID                           6880 non-null   object        
 2   Market/Regular                      6880 non-null   object        
 3   BookingID_Date                      6880 non-null   datetime64[ns]
 4   vehicle_no                          6880 non-null   object        
 5   Origin_Location                     6880 non-null   object        
 6   Destination_Location                6880 non-null   object        
 7   Org_lat_lon                         6880 non-null   object        
 8   Des_lat_lon                         6880 non-null   object        
 9   Data_Ping_time                      5927 non-null   datetime64[ns]
 10  Planned_ETA             

In [6]:
def convert_to_camel_case(column_name):
    """
    Convert a column name to camel case.

    Every non-alphanumeric character is treated as a word separator. The first
    word is lower-cased entirely; each following word is passed through
    ``str.capitalize`` (first letter upper-case, remainder lower-case), so
    ``'BookingID_Date'`` becomes ``'bookingidDate'``.

    Parameters:
        column_name: original column name (str)

    Returns:
        The camel-case name. A name without any alphanumeric character
        (for example ``''`` or ``'___'``) is returned unchanged.
    """
    # Replace special characters with spaces, then split on whitespace
    words = ''.join(c if c.isalnum() else ' ' for c in column_name).split()

    # Nothing to convert: avoid indexing into an empty word list
    if not words:
        return column_name

    # First word lower-cased; remaining words capitalised on their first letter only
    return words[0].lower() + ''.join(word.capitalize() for word in words[1:])

In [7]:
# Convert every column name of a DataFrame to camel case
def convert_columns_to_camel_case(df):
    """
    Rename all columns of ``df`` to camel case, in place.

    Each column name is converted with ``convert_to_camel_case``; the
    old -> new mapping is printed so the change can be reviewed.

    Parameters:
        df: DataFrame whose columns are renamed (modified in place)

    Returns:
        The same DataFrame object, with renamed columns.

    Raises:
        ValueError: if two different source columns would end up with the
            same converted name. Nothing is renamed in that case.
    """
    # Map each current column name to its converted name
    column_mapping = {col: convert_to_camel_case(col) for col in df.columns}

    # Refuse to create duplicate labels: group the source names by target name
    sources_by_target = {}
    for old_col, new_col in column_mapping.items():
        sources_by_target.setdefault(new_col, []).append(old_col)
    collisions = {
        new_col: old_cols
        for new_col, old_cols in sources_by_target.items()
        if len(old_cols) > 1
    }
    if collisions:
        raise ValueError(f"camel case 변환 후 컬럼명이 중복됩니다: {collisions}")

    # Print the mapping (before/after check)
    print("컬럼명 변경 사항:")
    for old_col, new_col in column_mapping.items():
        print(f"{old_col} -> {new_col}")

    # Apply the new column names to the DataFrame
    df.rename(columns=column_mapping, inplace=True)

    return df

In [8]:
# Apply the camel-case conversion to the column names
df = convert_columns_to_camel_case(df)

# Show the resulting column names
print("\n변환된 컬럼명:", df.columns.tolist(), sep="\n")

컬럼명 변경 사항:
GpsProvider -> gpsprovider
BookingID -> bookingid
Market/Regular  -> marketRegular
BookingID_Date -> bookingidDate
vehicle_no -> vehicleNo
Origin_Location -> originLocation
Destination_Location -> destinationLocation
Org_lat_lon -> orgLatLon
Des_lat_lon -> desLatLon
Data_Ping_time -> dataPingTime
Planned_ETA -> plannedEta
Current_Location -> currentLocation
DestinationLocation -> destinationlocation
actual_eta -> actualEta
Curr_lat -> currLat
Curr_lon -> currLon
ontime -> ontime
delay -> delay
OriginLocation_Code -> originlocationCode
DestinationLocation_Code -> destinationlocationCode
trip_start_date -> tripStartDate
trip_end_date -> tripEndDate
TRANSPORTATION_DISTANCE_IN_KM -> transportationDistanceInKm
vehicleType -> vehicletype
Minimum_kms_to_be_covered_in_a_day -> minimumKmsToBeCoveredInADay
Driver_Name -> driverName
Driver_MobileNo -> driverMobileno
customerID -> customerid
customerNameCode -> customernamecode
supplierID -> supplierid
supplierNameCode -> suppliernameco

In [9]:
# Trip duration: parse both trip timestamps, then take their difference in hours
df['tripStartDate'], df['tripEndDate'] = (
    pd.to_datetime(df['tripStartDate']),
    pd.to_datetime(df['tripEndDate']),
)
df['tripDuration'] = (
    df['tripEndDate']
    .sub(df['tripStartDate'])
    .dt.total_seconds()
    .div(3600)  # seconds -> hours
)

In [10]:
# Columns needed for the analysis
List = [
    'gpsprovider',
    'bookingid',
    'bookingidDate',
    'vehicleNo',
    'originLocation',
    'destinationLocation',
    'orgLatLon',
    'desLatLon',
    'plannedEta',
    'currentLocation',
    'destinationlocation',
    'actualEta',
    'ontime',
    'delay',
    'tripStartDate',
    'tripEndDate',
    'tripDuration',
    'transportationDistanceInKm',
    'minimumKmsToBeCoveredInADay',
    'customerid',
    'customernamecode',
]
# Work on an independent copy so later edits do not touch df
test = df.loc[:, List].copy()

In [11]:
# Preview the first ten rows of the selected feature subset.
test.head(10)

Unnamed: 0,gpsprovider,bookingid,bookingidDate,vehicleNo,originLocation,destinationLocation,orgLatLon,desLatLon,plannedEta,currentLocation,...,actualEta,ontime,delay,tripStartDate,tripEndDate,tripDuration,transportationDistanceInKm,minimumKmsToBeCoveredInADay,customerid,customernamecode
0,CONSENT TRACK,MVCV0000927/082021,2020-08-17 14:59:01.000,KA590408,"TVSLSL-PUZHAL-HUB,CHENNAI,TAMIL NADU","ASHOK LEYLAND PLANT 1- HOSUR,HOSUR,KARNATAKA","13.1550,80.1960","12.7400,77.8200",2020-08-21 18:59:01,"Vaniyambadi Rd, Valayambattu, Tamil Nadu 63575...",...,2020-08-28 14:38:04.447000,,R,2020-08-17 14:59:01,NaT,,320.0,,ALLEXCHE45,Ashok leyland limited
1,VAMOSYS,VCV00014271/082021,2020-08-27 16:22:22.827,TN30BC5917,"DAIMLER INDIA COMMERCIAL VEHICLES,KANCHIPURAM,...","DAIMLER INDIA COMMERCIAL VEHICLES,KANCHIPURAM,...","12.8390,79.9540","12.8390,79.9540",2020-08-31 20:22:22.827000,"Unnamed Road, Oragadam Industrial Corridor, Va...",...,2020-08-28 12:46:17.007000,G,,2020-08-27 16:21:52,NaT,,103.0,,DMREXCHEUX,Daimler india commercial vehicles pvt lt
2,CONSENT TRACK,VCV00014382/082021,2020-08-27 17:59:24.987,TN22AR2748,"LUCAS TVS LTD-PONDY,PONDY,PONDICHERRY","LUCAS TVS LTD-PONDY,PONDY,PONDICHERRY","11.8710,79.7390","11.8710,79.7390",2020-08-31 21:59:24.987000,"570, National Hwy 48, Shenoy Nagar, Chennai, T...",...,2020-08-28 16:03:30.793000,G,,2020-08-27 17:57:04,NaT,,300.0,,LUTGCCHE06,Lucas tvs ltd
3,VAMOSYS,VCV00014743/082021,2020-08-28 00:48:24.503,TN28AQ0781,"DAIMLER INDIA COMMERCIAL VEHICLES,KANCHIPURAM,...","DAIMLER INDIA COMMERCIAL VEHICLES,KANCHIPURAM,...","12.8390,79.9540","12.8390,79.9540",2020-09-01 04:48:24.503000,"Singaperumal Koil - Sriperumbudur Rd, Oragadam...",...,2020-08-28 12:50:27.997000,G,,2020-08-28 00:47:45,NaT,,61.0,,DMREXCHEUX,Daimler india commercial vehicles pvt lt
4,VAMOSYS,VCV00014744/082021,2020-08-28 01:23:19.243,TN68F1722,"LUCAS TVS LTD-PONDY,PONDY,PONDICHERRY","LUCAS TVS LTD-PONDY,PONDY,PONDICHERRY","11.8720,79.6320","11.8720,79.6320",2020-09-01 05:23:19.243000,"Melmaruvathur, Tamil Nadu 603319, India",...,2020-08-28 14:22:50.127000,G,,2020-08-28 01:13:48,NaT,,240.0,,LUTGCCHE06,Lucas tvs ltd
5,VAMOSYS,VCV00014749/082021,2020-08-28 02:14:22.640,TN88A4980,"DAIMLER INDIA COMMERCIAL VEHICLES,KANCHIPURAM,...","DAIMLER INDIA COMMERCIAL VEHICLES,KANCHIPURAM,...","12.8390,79.9540","12.8390,79.9540",2020-09-01 06:14:22.640000,"Ind.park Road, Nayapakkam, Tamil Nadu 602105, ...",...,2020-08-28 13:25:50.353000,G,,2020-08-28 02:13:39,NaT,,70.0,,DMREXCHEUX,Daimler india commercial vehicles pvt lt
6,VAMOSYS,VCV00014750/082021,2020-08-28 02:20:27.530,TN88C8204,"DAIMLER INDIA COMMERCIAL VEHICLES,KANCHIPURAM,...","DAIMLER INDIA COMMERCIAL VEHICLES,KANCHIPURAM,...","12.8390,79.9540","12.8390,79.9540",2020-09-01 06:20:27.530000,"Rettai Kovil Bus Stop, 64, Salem - Ulundurpett...",...,2020-08-28 17:38:13.480000,G,,2020-08-28 02:19:47,NaT,,931.0,,DMREXCHEUX,Daimler india commercial vehicles pvt lt
7,VAMOSYS,VCV00014812/082021,2020-08-28 09:22:31.377,TN88D4133,"DAIMLER INDIA COMMERCIAL VEHICLES,KANCHIPURAM,...","DAIMLER INDIA COMMERCIAL VEHICLES,KANCHIPURAM,...","12.8390,79.9540","12.8390,79.9540",2020-09-01 13:22:31.377000,"Singaperumal Koil - Sriperumbudur Rd, Oragadam...",...,2020-08-28 12:49:03.727000,G,,2020-08-28 09:21:56,NaT,,20.0,,DMREXCHEUX,Daimler india commercial vehicles pvt lt
8,CONSENT TRACK,MVCV0001769/082021,2020-08-28 09:38:30.000,TN23AM4662,"ASHOK LEYLAND ENNORE,CHENNAI,TAMIL NADU","ASHOK LEYLAND PLANT 2-HOSUR,HOSUR,KARNATAKA","13.2150,80.3200","12.7660,77.7860",2020-09-01 13:38:30,"Mumbai Hwy, Komeswaram, Tamil Nadu 635802, India",...,2020-08-28 15:00:19.080000,G,,2020-08-28 09:38:30,NaT,,310.0,,ALLEXCHE45,Ashok leyland limited
9,VAMOSYS,VCV00014665/082021,2020-08-27 22:27:54.427,TN30BC5982,"DAIMLER INDIA COMMERCIAL VEHICLES,KANCHIPURAM,...","DAIMLER INDIA COMMERCIAL VEHICLES,KANCHIPURAM,...","12.8390,79.9540","12.8390,79.9540",2020-09-01 02:27:54.427000,"Unnamed Road, Oragadam Industrial Corridor, Va...",...,2020-08-28 12:46:20.717000,G,,2020-08-27 22:27:14,NaT,,103.0,,DMREXCHEUX,Daimler india commercial vehicles pvt lt


In [12]:
def analyze_dataframe(df):
    """
    Print a basic exploratory summary of a DataFrame.

    Four sections are written to standard output:
      1. shape and ``DataFrame.info()`` report,
      2. missing-value count and percentage for columns that have any,
      3. ``describe()`` statistics for the numeric columns,
      4. number of unique values and the five most frequent values of every
         object/category column.

    Parameters:
        df: DataFrame to summarise (not modified)

    Returns:
        None
    """
    print("="*50)
    print("1. 데이터프레임 기본 정보")
    print("="*50)
    print(f"데이터 크기: {df.shape}")
    print("\n데이터 타입 정보:")
    # info() prints its own report and returns None, so wrapping it in print()
    # would add a stray "None" line to the output.
    df.info()

    print("\n"+"="*50)
    print("2. 결측치 분석")
    print("="*50)
    # Missing-value count and percentage per column (null mask computed once)
    null_mask = df.isnull()
    missing_values = null_mask.sum()
    missing_ratio = null_mask.mean() * 100

    missing_info = pd.DataFrame({
        '결측치 개수': missing_values,
        '결측치 비율(%)': missing_ratio.round(2)
    })
    print(missing_info[missing_info['결측치 개수'] > 0])  # only columns that have missing values

    print("\n"+"="*50)
    print("3. 수치형 변수 기초 통계량")
    print("="*50)
    # 'number' selects every numeric dtype, not only int64/float64
    numeric_columns = df.select_dtypes(include='number').columns
    if len(numeric_columns) > 0:
        print(df[numeric_columns].describe())
    else:
        # describe() raises on a frame without columns, so report it instead
        print("수치형 변수가 없습니다.")

    print("\n"+"="*50)
    print("4. 범주형 변수 분포")
    print("="*50)
    categorical_columns = df.select_dtypes(include=['object', 'category']).columns
    for col in categorical_columns:
        print(f"\n{col} 변수의 고유값 개수: {df[col].nunique()}")
        print("\n상위 5개 빈도:")
        print(df[col].value_counts().head())

In [13]:
# Print the basic profiling report for the feature subset.
analyze_dataframe(test)

1. 데이터프레임 기본 정보
데이터 크기: (6880, 21)

데이터 타입 정보:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6880 entries, 0 to 6879
Data columns (total 21 columns):
 #   Column                       Non-Null Count  Dtype         
---  ------                       --------------  -----         
 0   gpsprovider                  5927 non-null   object        
 1   bookingid                    6880 non-null   object        
 2   bookingidDate                6880 non-null   datetime64[ns]
 3   vehicleNo                    6880 non-null   object        
 4   originLocation               6880 non-null   object        
 5   destinationLocation          6880 non-null   object        
 6   orgLatLon                    6880 non-null   object        
 7   desLatLon                    6880 non-null   object        
 8   plannedEta                   6880 non-null   object        
 9   currentLocation              5916 non-null   object        
 10  destinationlocation          6880 non-null   object        
 

In [14]:
# Build a composite unique key
def create_unique_key(df):
    """
    Add a 'uniqueKey' column built from gpsprovider, bookingid and customerid.

    The three columns are cast to str and joined with '_'. Row count,
    distinct-key count and their difference are printed. When the key is not
    unique, the duplicated rows are printed and EVERY key (not only the
    duplicated ones) is prefixed with its occurrence number within its group
    ('0_', '1_', ...).

    Parameters:
        df: DataFrame with 'gpsprovider', 'bookingid' and 'customerid'
            columns (modified in place)

    Returns:
        The same DataFrame object, with the 'uniqueKey' column added.
    """
    # Cast each key part to str, then join the parts with underscores
    provider, booking, customer = (
        df[part].astype(str) for part in ('gpsprovider', 'bookingid', 'customerid')
    )
    df['uniqueKey'] = provider + '_' + booking + '_' + customer

    # Check how unique the key is
    row_count = len(df)
    distinct_count = len(df['uniqueKey'].drop_duplicates())

    print(f"전체 행 수: {row_count}")
    print(f"고유 키 수: {distinct_count}")
    print(f"중복 키 수: {row_count - distinct_count}")

    # Key already unique: nothing else to do
    if row_count == distinct_count:
        return df

    # Show every row that shares its key with another row
    print("\n중복된 키 목록:")
    repeated = df.duplicated(subset='uniqueKey', keep=False)
    duplicates = df[repeated].sort_values('uniqueKey')
    print(duplicates[['uniqueKey', 'gpsprovider', 'bookingid', 'customerid']])

    # Disambiguate by prefixing the per-key occurrence number
    occurrence = df.groupby('uniqueKey').cumcount().astype(str)
    df['uniqueKey'] = occurrence + '_' + df['uniqueKey']

    return df

# Run the key builder
test = create_unique_key(test)

# Verify the result
print("\n최종 Unique Key 확인:")
print(f"전체 행 수: {test.shape[0]}")
print(f"고유 키 수: {len(test['uniqueKey'].drop_duplicates())}")

# Show a sample of the generated keys
print("\n생성된 Unique Key 샘플:")
print(test.loc[:, ['uniqueKey', 'gpsprovider', 'bookingid', 'customerid']].head())

전체 행 수: 6880
고유 키 수: 6880
중복 키 수: 0

최종 Unique Key 확인:
전체 행 수: 6880
고유 키 수: 6880

생성된 Unique Key 샘플:
                                     uniqueKey    gpsprovider  \
0  CONSENT TRACK_MVCV0000927/082021_ALLEXCHE45  CONSENT TRACK   
1        VAMOSYS_VCV00014271/082021_DMREXCHEUX        VAMOSYS   
2  CONSENT TRACK_VCV00014382/082021_LUTGCCHE06  CONSENT TRACK   
3        VAMOSYS_VCV00014743/082021_DMREXCHEUX        VAMOSYS   
4        VAMOSYS_VCV00014744/082021_LUTGCCHE06        VAMOSYS   

            bookingid  customerid  
0  MVCV0000927/082021  ALLEXCHE45  
1  VCV00014271/082021  DMREXCHEUX  
2  VCV00014382/082021  LUTGCCHE06  
3  VCV00014743/082021  DMREXCHEUX  
4  VCV00014744/082021  LUTGCCHE06  


In [20]:
def create_unified_delivery_status(df):
    """
    Merge the 'ontime' ('G') and 'delay' ('R') flags into one status column.

    Two columns are added to ``df`` in place:
      * 'delivery_status': 'ontime', 'delay' or 'UNKNOWN'
      * 'delivery_status_code': 0, 1 or -1 respectively

    Rows carrying both 'G' and 'R' are reported as contradictory; they end up
    as 'delay' because the delay rule is applied after the ontime rule.

    Parameters:
        df: DataFrame with 'ontime' and 'delay' columns

    Returns:
        The same DataFrame object with the two new columns.
    """
    # Flag masks; the source columns are not modified below
    flagged_ontime = df['ontime'] == 'G'
    flagged_delay = df['delay'] == 'R'

    # Distribution of the raw flags
    print("=== 기존 데이터 값 분포 ===")
    for source_col in ('ontime', 'delay'):
        print(f"\n{source_col} 컬럼 값 분포:")
        print(df[source_col].value_counts())

    # Start from the default, then apply the rules in order (delay wins over ontime)
    df['delivery_status'] = 'UNKNOWN'
    for mask, label in ((flagged_ontime, 'ontime'), (flagged_delay, 'delay')):
        df.loc[mask, 'delivery_status'] = label

    # Validation: rows flagged both on time and delayed contradict themselves
    conflicts = df[flagged_ontime & flagged_delay]
    if len(conflicts):
        print("\n=== 주의: 모순된 데이터 발견 ===")
        print(f"모순된 데이터 수: {len(conflicts)}")
        print("\n모순된 데이터 샘플:")
        print(conflicts[['ontime', 'delay', 'delivery_status']].head())

    # Resulting distribution
    print("\n=== 새로운 배송 상태 분포 ===")
    print(df['delivery_status'].value_counts())

    # Numeric encoding of the status (for machine-learning models)
    status_codes = {'ontime': 0, 'delay': 1, 'UNKNOWN': -1}
    df['delivery_status_code'] = df['delivery_status'].map(status_codes)

    return df

In [21]:
# Add the unified delivery-status columns to the feature subset.
test=create_unified_delivery_status(test)

=== 기존 데이터 값 분포 ===

ontime 컬럼 값 분포:
ontime
G    2548
Name: count, dtype: int64

delay 컬럼 값 분포:
delay
R    4342
Name: count, dtype: int64

=== 주의: 모순된 데이터 발견 ===
모순된 데이터 수: 24

모순된 데이터 샘플:
     ontime delay delivery_status
5820      G     R           delay
5821      G     R           delay
5822      G     R           delay
5823      G     R           delay
5824      G     R           delay

=== 새로운 배송 상태 분포 ===
delivery_status
delay      4342
ontime     2524
UNKNOWN      14
Name: count, dtype: int64


In [23]:
# Show the rows whose status code is -1, the code mapped from 'UNKNOWN'.
test[test['delivery_status_code']==-1]

Unnamed: 0,gpsprovider,bookingid,bookingidDate,vehicleNo,originLocation,destinationLocation,orgLatLon,desLatLon,plannedEta,currentLocation,...,tripStartDate,tripEndDate,tripDuration,transportationDistanceInKm,minimumKmsToBeCoveredInADay,customerid,customernamecode,uniqueKey,delivery_status,delivery_status_code
1514,VINAYAKA_TVS,AEIBK2025104,2020-08-07 11:27:20,KA52A5807,"Onnalvadi, Krishnagiri, Tamil Nadu","Shive, Pune, Maharashtra","12.683589,77.859239","18.750621,73.87719",2020-08-08 03:32:00,,...,2020-08-07 11:27:20,2020-08-11 14:35:00,99.127778,935.0,,NEMEXGURH5,Neel metal products ltd,VINAYAKA_TVS_AEIBK2025104_NEMEXGURH5,UNKNOWN,-1
1595,VAMOSYS,AEIBK2025125,2020-08-07 15:56:38,GJ02Z0239,"Kalri, Mahesana, Gujarat","Solgam, Ahmedabad, Gujarat","23.5159,72.077925","23.338649,71.975699",2020-08-07 16:46:08,,...,2020-08-07 15:56:38,2020-08-10 09:26:00,65.489444,26.0,,GTKEXGURJS,G-tekt india pvt ltd.,VAMOSYS_AEIBK2025125_GTKEXGURJS,UNKNOWN,-1
2725,CONSENT TRACK,AEIBK2021746,2020-06-30 21:47:14,DL01GC5983,"Peenya Small Industries, Bangalore, Karnataka","Dhatir, Faridabad, Haryana","13.025282,77.510345","28.192852,77.249137",2020-07-02 09:25:54,"Unnamed Road, Khadkighat, Maharashtra 431126, ...",...,2020-06-30 21:47:14,2020-07-29 13:37:00,687.829444,,,WENEXBAN18,Wipro enterprises pvt ltd,CONSENT TRACK_AEIBK2021746_WENEXBAN18,UNKNOWN,-1
2732,CONSENT TRACK,AEIBK2021951,2020-07-03 17:44:03,DL01GC6187,"Peenya Small Industries, Bangalore, Karnataka","Dhatir, Faridabad, Haryana","13.025282,77.510345","28.192852,77.249137",2020-07-05 05:23:34,"Mumbai - Agra National Hwy, Vighneshwar Nagar,...",...,2020-07-03 17:44:03,2020-07-09 11:22:00,137.6325,,,WENEXBAN18,Wipro enterprises pvt ltd,CONSENT TRACK_AEIBK2021951_WENEXBAN18,UNKNOWN,-1
2733,EKTA,AEIBK2021962,2020-07-03 22:44:53,HR47D0664,"Irungattukottai, Kanchipuram, Tamil Nadu","Dhatir, Faridabad, Haryana","13.010768,79.993135","28.192852,77.249137",2020-07-05 12:14:57,"NH 44, Shankarapur, Telangana 504323, India",...,2020-07-03 22:44:53,2020-07-09 11:22:00,132.618611,,,WENEXBAN18,Wipro enterprises pvt ltd,EKTA_AEIBK2021962_WENEXBAN18,UNKNOWN,-1
2734,CONSENT TRACK,AEIBK2022212,2020-07-08 11:53:00,RJ14GF3519,"Peenya Small Industries, Bangalore, Karnataka","Mahindra World City, Jaipur, Rajasthan","13.025282,77.510345","27.033987,75.776267",2020-07-09 22:59:54,"Unnamed Road, Salhawas, Haryana 123401, India",...,2020-07-08 11:53:00,2020-07-09 11:22:00,23.483333,,,WENEXBAN18,Wipro enterprises pvt ltd,CONSENT TRACK_AEIBK2022212_WENEXBAN18,UNKNOWN,-1
5379,CONSENT TRACK,AEIBK1901943,2019-10-20 16:36:54,KA01AE9163,"Mugabala, Bangalore Rural, Karnataka","Anekal, Bangalore, Karnataka","16.560192249175344,80.792293091599547","12.777874729699617,77.642275537347089",2019-10-20 19:42:54,"Rajapura Rd, Haragadde, Karnataka 560105, India",...,2019-10-20 16:36:54,2019-10-22 09:59:00,41.368333,49.0,0.0,LTLEXMUM40,Larsen & toubro limited,CONSENT TRACK_AEIBK1901943_LTLEXMUM40,UNKNOWN,-1
5384,MANUAL,AEIBK1901336,2019-10-15 13:08:50,TN52U1029,"Adavipalem, West Godavari, Andhra Pradesh","Parandur, Kanchipuram, Tamil Nadu","16.510615,81.75956","12.890556,79.72877",2019-10-16 01:19:53,"Narasimha Rao Pet Fire Station Bus Stop, Vijay...",...,2019-10-15 13:08:50,2019-10-23 10:08:00,188.986111,716.1,275.0,LTLEXMUM40,Larsen & toubro limited,MANUAL_AEIBK1901336_LTLEXMUM40,UNKNOWN,-1
5390,MANUAL,AEIBK1901349,2019-10-15 17:49:14,AP16TH6969,"Nidamanuru, Krishna, Andhra Pradesh","Apsp Colony, Guntur, Andhra Pradesh","16.500876,80.760239","16.449872,80.532732",2019-10-15 19:03:06,"5-32, Nidamanuru Rd, Ramanagar, Nidamanuru, Vi...",...,2019-10-15 17:49:14,2019-10-21 14:25:00,140.596111,72.0,,LTLEXMUM40,Larsen & toubro limited,MANUAL_AEIBK1901349_LTLEXMUM40,UNKNOWN,-1
5412,MANUAL,AEIBK1901351,2019-10-15 17:48:55,AP16TY1967,"Nidamanuru, Krishna, Andhra Pradesh","Apsp Colony, Guntur, Andhra Pradesh","16.500876,80.760239","16.449872,80.532732",2019-10-15 19:02:47,"5-32, Nidamanuru Rd, Ramanagar, Nidamanuru, Vi...",...,2019-10-15 17:48:55,2019-10-19 03:23:00,81.568056,72.0,,LTLEXMUM40,Larsen & toubro limited,MANUAL_AEIBK1901351_LTLEXMUM40,UNKNOWN,-1


In [24]:
# Data-quality check for the unified delivery status
def validate_delivery_status(df):
    """
    Report how the unified delivery status is distributed.

    Prints the record count, the count and percentage of each
    'delivery_status' value and, if any row is 'UNKNOWN', the first few such
    rows with their 'ontime' and 'delay' flags.

    Parameters:
        df: DataFrame with a 'delivery_status' column ('ontime' and 'delay'
            are also read when 'UNKNOWN' rows exist)

    Returns:
        (status_counts, status_percentages): value counts per status and the
        matching percentages rounded to two decimals.
    """
    print("=== 데이터 품질 검증 ===")

    # Total number of records
    row_total = len(df)
    print(f"\n전체 레코드 수: {row_total}")

    # Count and share of each status
    status_counts = df['delivery_status'].value_counts()
    status_percentages = status_counts.div(row_total).mul(100).round(2)

    print("\n배송 상태별 분포:")
    for status, count in status_counts.items():
        print(f"{status}: {count} 건 ({status_percentages[status]}%)")

    # Sample of rows that could not be classified
    is_unknown = df['delivery_status'] == 'UNKNOWN'
    if is_unknown.any():
        print("\n=== UNKNOWN 상태 데이터 샘플 ===")
        print(df.loc[is_unknown, ['ontime', 'delay', 'delivery_status']].head())

    return status_counts, status_percentages

In [25]:
# Print the status distribution; the returned (counts, percentages) tuple is echoed as the cell output.
validate_delivery_status(test)

=== 데이터 품질 검증 ===

전체 레코드 수: 6880

배송 상태별 분포:
delay: 4342 건 (63.11%)
ontime: 2524 건 (36.69%)
UNKNOWN: 14 건 (0.2%)

=== UNKNOWN 상태 데이터 샘플 ===
     ontime delay delivery_status
1514    NaN   NaN         UNKNOWN
1595    NaN   NaN         UNKNOWN
2725    NaN   NaN         UNKNOWN
2732    NaN   NaN         UNKNOWN
2733    NaN   NaN         UNKNOWN


(delivery_status
 delay      4342
 ontime     2524
 UNKNOWN      14
 Name: count, dtype: int64,
 delivery_status
 delay      63.11
 ontime     36.69
 UNKNOWN     0.20
 Name: count, dtype: float64)