# Insights_from_dataframes

### Look for correlations

In [None]:
# Pairwise correlation matrix of the numeric columns.
# numeric_only=True (pandas >= 1.5) skips text columns such as 'drive-wheels';
# without it, corr() raises on pandas >= 2.0 when the frame holds strings.
df.corr(numeric_only=True)
# or: correlation of every numeric column with 'price', sorted ascending
df.corr(numeric_only=True)['price'].sort_values()

### Check statistics

In [None]:
# Summary statistics with the default column selection
# (restrict to a subset with df[['column1', 'column2', ...]].describe()).
df.describe()
# Ask for the object columns explicitly, so they get statistics as well.
df.describe(include=["object"])
# include="all" gives the broadest description.
df.describe(include="all")

### Variable vs Variable scatter plots check

In [None]:
# Columns to compare against each other, one pair per panel.
attributes = [
    "median_house_value",
    "median_income",
    "housing_median_age",
    "rooms_per_household",
]
# Grid of scatter plots over the selected columns of `housing`.
scatter_matrix(housing[attributes], figsize=(12, 8))

### Grouping

In [None]:
# Average price per drive-wheels category.
# Select the grouping column plus the numeric column to average, starting from
# `df`, so the cell runs on a fresh kernel and mean() only sees numeric data.
df_group_one = (
    df[['drive-wheels', 'price']]
    .groupby(['drive-wheels'], as_index=False)
    .mean()
)

### Pivoting tables (of groups)

In [None]:
# Group on two keys at once: mean price per (drive-wheels, body-style) pair.
df_gptest = df[['drive-wheels', 'body-style', 'price']]
grouped_test1 = (
    df_gptest
    .groupby(by=['drive-wheels', 'body-style'], as_index=False)
    .mean()
)

# The long result is hard to read, so pivot it into a grid:
# one row per drive-wheels value, one column per body-style value.
grouped_pivot = grouped_test1.pivot(columns='body-style', index='drive-wheels')

### Color-code plot of the pivot table 

In [None]:
# Quick look: heatmap of the pivot table, one coloured cell per table entry.
plt.pcolor(grouped_pivot, cmap='RdBu')
plt.colorbar()
plt.show()

# Same heatmap again, this time with readable labels on both axes.
fig, ax = plt.subplots()
im = ax.pcolor(grouped_pivot, cmap='RdBu')

# The x axis runs over the pivot's columns and the y axis over its rows.
# get_level_values(1) yields one label per plotted column, in column order,
# so the number of labels always matches the number of x ticks.
x_labels = grouped_pivot.columns.get_level_values(1)
y_labels = grouped_pivot.index

# Shift the ticks by 0.5 to move them (and their labels) to the cell centres.
ax.set_xticks(np.arange(grouped_pivot.shape[1]) + 0.5, minor=False)
ax.set_yticks(np.arange(grouped_pivot.shape[0]) + 0.5, minor=False)

# Replace the numeric tick positions with the category names.
ax.set_xticklabels(x_labels, minor=False)
ax.set_yticklabels(y_labels, minor=False)

# Name each axis after the pivot dimension it shows.
ax.set_xlabel(grouped_pivot.columns.names[1])
ax.set_ylabel(grouped_pivot.index.name)

# Rotate the x labels so long category names do not overlap.
ax.tick_params(axis='x', labelrotation=90)

fig.colorbar(im)
plt.show()

### Pearson coefficient

In [None]:
# Pearson correlation between wheel-base and price, together with its p-value.
pearson_result = stats.pearsonr(df['wheel-base'], df['price'])
pearson_coef, p_value = pearson_result
print(
    "The Pearson Correlation Coefficient is",
    pearson_coef,
    " with a P-value of P =",
    p_value,
)

##### **Important note:** 
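The correlation coefficient tells you whether a linear relationship exists between two variables (an absolute value above roughly 0.5 to 0.7 tends to indicate an acceptable linear relationship).

The p-value is a measure of the trustworthiness of this hypothesis: the smaller the p-value (conventionally below 0.05), the less likely it is that the observed correlation arose by chance.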

### The ANOVA test

In [None]:
# One-way ANOVA: does the mean price differ between the drive-wheels groups?
# drive-wheels is a categorical variable with 3 kinds of values and price is a
# continuous numerical value (float).
# Group by the plain column name rather than a one-element list, so that
# get_group() accepts the scalar keys below on current pandas versions.
grouped_test2 = df[['drive-wheels', 'price']].groupby('drive-wheels')

# Pass one price sample per drive-wheels category to the test.
f_val, p_val = stats.f_oneway(
    grouped_test2.get_group('fwd')['price'],
    grouped_test2.get_group('rwd')['price'],
    grouped_test2.get_group('4wd')['price'],
)
print( "ANOVA results: F=", f_val, ", P =", p_val)

### Plotting and comparing distributions

In [None]:
def DistributionPlot(RedFunction, BlueFunction, RedName, BlueName, Title,
                     xlabel='Price (in dollars)', ylabel='Proportion of Cars'):
    """Overlay the density curves of two samples on one figure.

    Parameters
    ----------
    RedFunction, BlueFunction : array-like of numbers
        The two samples to compare; drawn in red and blue respectively.
        Each is converted to a flat float array, so a column vector of
        shape (n, 1) is accepted as well as a 1-D sequence.
    RedName, BlueName : str
        Legend entries for the red and the blue curve.
    Title : str
        Figure title.
    xlabel, ylabel : str, optional
        Axis labels; the defaults keep the original car-price wording.

    Returns
    -------
    None
        The figure is shown and then closed.
    """
    # Local import keeps the helper usable even if numpy was not imported
    # under the name `np` in the notebook.
    import numpy as np

    width = 12
    height = 10
    fig, ax = plt.subplots(figsize=(width, height))

    for values, colour, label in (
        (RedFunction, "r", RedName),
        (BlueFunction, "b", BlueName),
    ):
        # kdeplot replaces the deprecated distplot(hist=False); it expects a
        # 1-D vector for x, so flatten the input first.
        flat = np.ravel(np.asarray(values, dtype=float))
        sns.kdeplot(x=flat, color=colour, label=label, ax=ax)

    ax.set_title(Title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    # Without an explicit legend the RedName/BlueName labels are never drawn.
    ax.legend()

    plt.show()
    plt.close(fig)

In [None]:
# Compare the distribution of the actual TRAIN targets with the predictions made on the TRAIN set.
Title = 'Distribution  Plot of  Predicted Value Using Training Data vs Training Data Distribution'
DistributionPlot(
    RedFunction=y_train,
    BlueFunction=yhat_train,
    RedName="Actual Values (Train)",
    BlueName="Predicted Values (Train)",
    Title=Title,
)

In [None]:
# Compare the distribution of the actual TEST targets with the predictions made on the TEST set.
Title = 'Distribution  Plot of  Predicted Value Using Test Data vs Data Distribution of Test Data'
DistributionPlot(
    RedFunction=y_test,
    BlueFunction=yhat_test,
    RedName="Actual Values (Test)",
    BlueName="Predicted Values (Test)",
    Title=Title,
)