In [None]:
import numpy as np
import pandas as pd

In [None]:
#load the raw data into df, the dataframe every later cell works on.
#read_csv needs the file location (path, URL, or open buffer) as its first argument.
data_path = "auto.csv"  # TODO(review): placeholder - point this at the raw data file
#header=None tells pandas the file has no header row: the first line is kept as data
#and the columns are numbered 0..N-1 until real names are assigned.
#Leave header=None out when the first line of the file already holds the column names.
df = pd.read_csv(data_path, header=None)

In [None]:
#data cleaning, data wrangling

In [None]:
#check distribution
#describe() returns summary statistics; info() prints column names, non-null counts and dtypes.
df.describe() # only numeric
df.describe(include="all") # every column
df[['cola','colb']].describe() # select columns ('cola'/'colb' are placeholders for real column names)
df.info()

In [None]:
#make sure data types are proper
#dtypes lists the data type pandas assigned to each column, so columns that should be
#numeric but were read as object can be spotted here.
df.dtypes

In [None]:
#assign column names when the file was read without a header row.
#The list must contain exactly one name per column of df, in file order.
headers = [
    "symboling",
    "normalized-losses",
    "make",
    "fuel-type",
    "aspiration",
    "num-of-doors",
    "body-style",
    "drive-wheels",
    "engine-location",
    "wheel-base",
    "length",
    "width",
    "height",
    "curb-weight",
    "engine-type",
    "num-of-cylinders",
    "engine-size",
    "fuel-system",
    "bore",
    "stroke",
    "compression-ratio",
    "horsepower",
    "peak-rpm",
    "city-mpg",
    "highway-mpg",
    "price",
]
df.columns = headers

In [None]:
#identify and handle missing values

In [None]:
#Options include:
#Try to find the missing data by consulting the source.
#Remove the data that has the missing piece: either the whole variable (column) or just the row with the missing element.
#Replace the missing data with a guess, such as the average or most common value of the other data.
#The source of the data might have extra knowledge about the data.  Something like "that field is often missing on older cars".
#Leave it missing.

In [None]:
#drop every row whose "price" value is missing (NaN).
#  axis=0        -> drop rows (axis=1 would drop columns instead).
#  subset=[...]  -> only these columns are checked for missing values.
#  inplace=True  -> modify df itself instead of returning a new dataframe.
#Without inplace=True the call leaves df untouched and just returns the filtered copy,
#which is handy for previewing the result during testing.
df.dropna(
    axis=0,
    subset=["price"],
    inplace=True,
)

In [None]:
#data formatting
#standardize the values into the same format, unit, or convention.

In [None]:
#Adjusting column data for every sample (row) in the column.
#This is just adding one, but it could also do things like convert miles to kilometers, etc.
#Convert New York to NY, etc.
#NOTE: this cell is not idempotent - re-running it adds 1 again and, once the rename
#below has happened, the "city-mpg" lookup raises KeyError.
df["symboling"] = df["symboling"] + 1

#another example: transform the values, then rename the column to match the new unit.
df["city-mpg"] = 235/df["city-mpg"]
#The rename key must match the existing column name exactly ("city-mpg", with a hyphen).
#rename() silently ignores keys that are not present, so a misspelled key changes nothing.
df.rename(columns={"city-mpg": "city-L/100km"}, inplace=True)  # Change the column heading to reflect what changed.

#numbers of type object needing to be converted to a number
df["price"] = df["price"].astype("int")


In [None]:
#replacing missing values with the mean value
mean = df["normalized-losses"].mean()  # mean() skips NaN entries by default
#fillna returns a new Series; it has to be assigned back or the column stays unchanged.
df["normalized-losses"] = df["normalized-losses"].fillna(mean)

In [None]:
#data normalization (centering / scaling)

In [None]:
# It may involve rescaling all values in a column to lie between zero and one while keeping their proportions to each other.

In [None]:
#Three ways to rescale a column. Each statement overwrites "length" with the result of
#rescaling its current contents, so the statements build on one another when run in sequence.
#simple scaling: divide by the largest value.
df["length"] = df["length"].div(df["length"].max())
#min-max: subtract the minimum, then divide by the range (max - min).
df["length"] = df["length"].sub(df["length"].min()).div(df["length"].max() - df["length"].min())
#z-score: subtract the mean, then divide by the standard deviation.  Mostly -3 to +3.
df["length"] = df["length"].sub(df["length"].mean()).div(df["length"].std())

In [None]:
#write the adjusted dataframe to disk; index=False keeps the row index out of the file.
df.to_csv(path_or_buf="automobile.csv", index=False)

In [None]:
#data binning

In [None]:
#Bins covering equal sized ranges of the range of a value.
#linspace(min, max, 4) returns four equally spaced numbers covering the whole range of prices:
#one at each end plus two in between, i.e. the edges of 3 equal-width bins.
#Series.min()/max() are used instead of the builtins because they skip NaN values.
bins = np.linspace(df["price"].min(), df["price"].max(), 4)
#one label per bin (number of labels = number of edges - 1).
group_names = ["Low","Medium","High"]
#include_lowest=True makes the first bin include its left edge, so the minimum price is binned too.
df["price-binned"]=pd.cut(df["price"], bins, labels=group_names, include_lowest=True)

In [None]:
#Turn categorical values to quantitative (numeric) variables

In [None]:
#One-hot encoding adds a column with a 0,1 flag for each category in another column.
#The column is named "fuel-type" in the headers assigned to df (there is no "fuel" column).
pd.get_dummies(df["fuel-type"])  #converts (gas, diesel) into two separate flag columns for gas and diesel.