In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import time
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("US_Accidents_June20.csv")

In [3]:
def reduce_mem(df): #Thanks to KEREM ÜRKMEZ
    starttime = time.time()
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if pd.isnull(c_min) or pd.isnull(c_max):
                continue
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction),time spend:{:2.2f} min'.format(end_mem,
                                                                                                           100*(start_mem-end_mem)/start_mem,
                                                                                                           (time.time()-starttime)/60))
    return df

In [4]:
df = reduce_mem(df)
df

-- Mem. usage decreased to 710.38 Mb (29.6% reduction),time spend:0.03 min


Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.87500,-84.0625,,,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,MapQuest,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.93750,-82.8125,,,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.06250,-84.0625,,,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.75000,-84.1875,,,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.62500,-84.1875,,,...,False,False,False,False,True,False,Day,Day,Day,Day
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3513612,A-3513776,Bing,,2,2019-08-23 18:03:25,2019-08-23 18:32:01,34.00000,-117.3750,34.00000,-117.375,...,False,False,False,False,False,False,Day,Day,Day,Day
3513613,A-3513777,Bing,,2,2019-08-23 19:11:30,2019-08-23 19:38:23,32.78125,-117.1250,32.75000,-117.125,...,False,False,False,False,False,False,Day,Day,Day,Day
3513614,A-3513778,Bing,,2,2019-08-23 19:00:21,2019-08-23 19:28:49,33.78125,-117.8750,33.78125,-117.875,...,False,False,False,False,False,False,Day,Day,Day,Day
3513615,A-3513779,Bing,,2,2019-08-23 19:00:21,2019-08-23 19:29:42,34.00000,-118.3750,33.96875,-118.375,...,False,False,False,False,False,False,Day,Day,Day,Day


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3513617 entries, 0 to 3513616
Data columns (total 49 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   ID                     object 
 1   Source                 object 
 2   TMC                    float16
 3   Severity               int8   
 4   Start_Time             object 
 5   End_Time               object 
 6   Start_Lat              float16
 7   Start_Lng              float16
 8   End_Lat                float16
 9   End_Lng                float16
 10  Distance(mi)           float16
 11  Description            object 
 12  Number                 float32
 13  Street                 object 
 14  Side                   object 
 15  City                   object 
 16  County                 object 
 17  State                  object 
 18  Zipcode                object 
 19  Country                object 
 20  Timezone               object 
 21  Airport_Code           object 
 22  Weather_Timestamp 

In [6]:
df.isna().sum()

ID                             0
Source                         0
TMC                      1034799
Severity                       0
Start_Time                     0
End_Time                       0
Start_Lat                      0
Start_Lng                      0
End_Lat                  2478818
End_Lng                  2478818
Distance(mi)                   0
Description                    1
Number                   2262864
Street                         0
Side                           0
City                         112
County                         0
State                          0
Zipcode                     1069
Country                        0
Timezone                    3880
Airport_Code                6758
Weather_Timestamp          43323
Temperature(F)             65732
Wind_Chill(F)            1868249
Humidity(%)                69687
Pressure(in)               55882
Visibility(mi)             75856
Wind_Direction             58874
Wind_Speed(mph)           454609
Precipitat

ÖNEMLİ  
  

ENDLAT, ENDLNG, Number, WindChill, Precipitation çok fazla null data var direkt drop işlemi uygulanabilir.

Veri açıklamalarına göre çok da fazla bir önemleri yok.  
https://smoosavi.org/datasets/us_accidents

In [7]:
df.drop(["End_Lat", "End_Lng", "Number", "Wind_Chill(F)", "Precipitation(in)"], axis=1,inplace=True)

Açıklaalarına bakarak elenecek olanlar:

Street, City, County, Statte, Zipcode, Country, Airport_Code, Wind_Dİrection, Amenity, 

In [8]:
df.drop(["End_Time", "Street", "City", "County", "State", "Zipcode", "Country", "Airport_Code", "Wind_Direction", "Amenity"], axis=1,inplace=True)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3513617 entries, 0 to 3513616
Data columns (total 34 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   ID                     object 
 1   Source                 object 
 2   TMC                    float16
 3   Severity               int8   
 4   Start_Time             object 
 5   Start_Lat              float16
 6   Start_Lng              float16
 7   Distance(mi)           float16
 8   Description            object 
 9   Side                   object 
 10  Timezone               object 
 11  Weather_Timestamp      object 
 12  Temperature(F)         float16
 13  Humidity(%)            float16
 14  Pressure(in)           float16
 15  Visibility(mi)         float16
 16  Wind_Speed(mph)        float16
 17  Weather_Condition      object 
 18  Bump                   bool   
 19  Crossing               bool   
 20  Give_Way               bool   
 21  Junction               bool   
 22  No_Exit           

In [10]:
df.memory_usage().sum() / 1024**2

425.5576009750366

İlk iki eleme işlemi sonunda kolonların %30undan kurtulduk  
which is nice

In [None]:
df.query("")