In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Apply seaborn's "darkgrid" style to subsequent plots.
sns.set_style('darkgrid')
# Default (width, height) used for matplotlib figures created after this point.
mpl.rcParams['figure.figsize'] = [18,10]

### What is feature selection?

- Selecting features to be used for modeling
- Doesn't create new features
- Improves model performance

***

### Removing redundant features

- Remove noisy features
- Remove correlated features
- Remove duplicated features

### Correlated features

- Statistically correlated: features move together directionally
- Linear models assume feature independence
- Pearson correlation coefficient

```python
df.corr()
```

In [3]:
# Load the volunteer opportunities data; the path is relative to the working directory.
volunteer = pd.read_csv('data/volunteer_opportunities.csv')

In [4]:
# Columns treated as redundant for this exercise; they are removed below.
to_drop = [
    "locality",
    "region",
    "category_desc",
    "vol_requests",
    "created_date",
]

# Build a new frame without those columns; `volunteer` itself is not modified.
volunteer_subset = volunteer.drop(columns=to_drop)

# Preview the first rows of the reduced frame.
print(volunteer_subset.head())

   opportunity_id  content_id  event_time  \
0            4996       37004           0   
1            5008       37036           0   
2            5016       37143           0   
3            5022       37237           0   
4            5055       37425           0   

                                               title  hits  \
0  Volunteers Needed For Rise Up & Stay Put! Home...   737   
1                                       Web designer    22   
2      Urban Adventures - Ice Skating at Lasker Rink    62   
3  Fight global hunger and support women farmers ...    14   
4                                      Stop 'N' Swap    31   

                                             summary is_priority  category_id  \
0  Building on successful events last summer and ...         NaN          NaN   
1             Build a website for an Afghan business         NaN          1.0   
2  Please join us and the students from Mott Hall...         NaN          1.0   
3  The Oxfam Action Corps is a g

In [5]:
# Load the wine data; the path is relative to the working directory.
wine = pd.read_csv('data/wine_types.csv')

In [6]:
# Pairwise correlation matrix of the columns (DataFrame.corr defaults to Pearson).
wine.corr()

Unnamed: 0,Type,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
Type,1.0,-0.328222,0.437776,-0.049643,0.517859,-0.209179,-0.719163,-0.847498,0.489109,-0.49913,0.265668,-0.617369,-0.78823,-0.633717
Alcohol,-0.328222,1.0,0.094397,0.211545,-0.310235,0.270798,0.289101,0.236815,-0.155929,0.136698,0.546364,-0.071747,0.072343,0.64372
Malic acid,0.437776,0.094397,1.0,0.164045,0.2885,-0.054575,-0.335167,-0.411007,0.292977,-0.220746,0.248985,-0.561296,-0.36871,-0.192011
Ash,-0.049643,0.211545,0.164045,1.0,0.443367,0.286587,0.12898,0.115077,0.18623,0.009652,0.258887,-0.074667,0.003911,0.223626
Alcalinity of ash,0.517859,-0.310235,0.2885,0.443367,1.0,-0.083333,-0.321113,-0.35137,0.361922,-0.197327,0.018732,-0.273955,-0.276769,-0.440597
Magnesium,-0.209179,0.270798,-0.054575,0.286587,-0.083333,1.0,0.214401,0.195784,-0.256294,0.236441,0.19995,0.055398,0.066004,0.393351
Total phenols,-0.719163,0.289101,-0.335167,0.12898,-0.321113,0.214401,1.0,0.864564,-0.449935,0.612413,-0.055136,0.433681,0.699949,0.498115
Flavanoids,-0.847498,0.236815,-0.411007,0.115077,-0.35137,0.195784,0.864564,1.0,-0.5379,0.652692,-0.172379,0.543479,0.787194,0.494193
Nonflavanoid phenols,0.489109,-0.155929,0.292977,0.18623,0.361922,-0.256294,-0.449935,-0.5379,1.0,-0.365845,0.139057,-0.26264,-0.50327,-0.311385
Proanthocyanins,-0.49913,0.136698,-0.220746,0.009652,-0.197327,0.236441,0.612413,0.652692,-0.365845,1.0,-0.02525,0.295544,0.519067,0.330417


In [7]:
# Flavanoids has a correlation above 0.75 with more than one other feature in
# the matrix above (Total phenols and OD280/OD315 of diluted wines), so it is
# the redundant column to remove.
to_drop = "Flavanoids"

# Rebind `wine` to a frame without that column. Re-running this cell on its own
# raises KeyError because the column is already gone.
wine = wine.drop(columns=to_drop)

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Reload the volunteer data and keep only the rows that have a category_desc value.
volunteer = pd.read_csv('data/volunteer_opportunities.csv').dropna(subset=['category_desc'])

# The summary column supplies the text documents to vectorize.
documents = volunteer['summary']

In [9]:
# Learn the vocabulary from the documents and encode them as a TF-IDF matrix
# (one row per document, one column per vocabulary term).
tfidf_vec = TfidfVectorizer()
text_tfidf = tfidf_vec.fit_transform(documents)

In [10]:
# Show the first 20 (term, column index) pairs of the fitted vocabulary.
print(list(tfidf_vec.vocabulary_.items())[:20])

[('build', 472), ('website', 3034), ('for', 1186), ('an', 233), ('afghan', 183), ('business', 477), ('please', 2140), ('join', 1582), ('us', 2955), ('and', 236), ('the', 2809), ('students', 2714), ('from', 1220), ('mott', 1851), ('hall', 1337), ('on', 1965), ('saturday', 2471), ('january', 1575), ('29th', 58), ('ice', 1447)]


In [11]:
# Stored TF-IDF weights for the document at row 3.
print(text_tfidf[3].data)

[0.13462896 0.1479415  0.21897898 0.14909988 0.17291077 0.13082327
 0.14909988 0.21211561 0.22737907 0.21211561 0.09069305 0.41262542
 0.22737907 0.18021965 0.22737907 0.21897898 0.10658498 0.22737907
 0.20631271 0.12736723 0.22737907 0.22737907 0.22737907 0.12300682
 0.05449404 0.05060413 0.13763268]


In [12]:
# Column (vocabulary) indices that the weights of row 3 belong to, in the same order as .data.
print(text_tfidf[3].indices)

[ 208 1078 2414 1208 1228 2822 1738  881 1179 2617 1373 1569 2956  481
 2944 1408 1899 1156 3023 1934 2332 2755 2688  483 2847  236 1186]


In [13]:
# Invert the fitted vocabulary (term -> column index) into column index -> term.
vocab = {column: term for term, column in tfidf_vec.vocabulary_.items()}

In [14]:
# Pair each column index of row 3 with its TF-IDF weight.
zipped_row = dict(zip(text_tfidf[3].indices, text_tfidf[3].data))

In [15]:
# Display the column index -> weight mapping for row 3.
print(zipped_row)

{208: 0.1346289617812831, 1078: 0.14794150479176504, 2414: 0.2189789807778424, 1208: 0.14909988332046015, 1228: 0.17291077238882824, 2822: 0.13082326601471828, 1738: 0.14909988332046015, 881: 0.21211560760073897, 1179: 0.22737907066114862, 2617: 0.21211560760073897, 1373: 0.09069304867582254, 1569: 0.4126254165261812, 2956: 0.22737907066114862, 481: 0.18021964581619443, 2944: 0.22737907066114862, 1408: 0.2189789807778424, 1899: 0.10658497928445922, 1156: 0.22737907066114862, 3023: 0.2063127082630906, 1934: 0.1273672321694633, 2332: 0.22737907066114862, 2755: 0.22737907066114862, 2688: 0.22737907066114862, 483: 0.12300682087356397, 2847: 0.05449403606411541, 236: 0.05060412885055118, 1186: 0.13763268361805742}


In [16]:
def return_weights(vocab, vector, vector_index):
    """Map each term stored in one row of a TF-IDF matrix to its weight.

    Parameters
    ----------
    vocab : mapping
        Column index -> term lookup.
    vector : indexable
        Object whose ``vector[vector_index]`` exposes parallel ``indices``
        and ``data`` sequences (column indices and their weights).
    vector_index : int
        Row to inspect.

    Returns
    -------
    dict
        Term -> weight, in the order the columns appear in ``indices``.
    """
    # Slice the row once and reuse it for both attributes.
    row = vector[vector_index]
    weights = {}
    for column, weight in zip(row.indices, row.data):
        weights[vocab[column]] = weight
    return weights

# Show the term -> weight mapping for the document at row 3.
print(return_weights(vocab, text_tfidf, 3))

{'all': 0.1346289617812831, 'experience': 0.14794150479176504, 'reuse': 0.2189789807778424, 'free': 0.14909988332046015, 'fun': 0.17291077238882824, 'this': 0.13082326601471828, 'make': 0.14909988332046015, 'display': 0.21211560760073897, 'fold': 0.22737907066114862, 'sort': 0.21211560760073897, 'help': 0.09069304867582254, 'items': 0.4126254165261812, 'usable': 0.22737907066114862, 'but': 0.18021964581619443, 'unwanted': 0.22737907066114862, 'homes': 0.2189789807778424, 'new': 0.10658497928445922, 'finding': 0.22737907066114862, 'waste': 0.2063127082630906, 'nyc': 0.1273672321694633, 'reduces': 0.22737907066114862, 'swap': 0.22737907066114862, 'stop': 0.22737907066114862, 'by': 0.12300682087356397, 'to': 0.05449403606411541, 'and': 0.05060412885055118, 'for': 0.13763268361805742}


In [17]:
# Switch the corpus from the summaries to the opportunity titles.
documents = volunteer.title

# Fit a fresh vectorizer on the titles; this rebinds tfidf_vec and text_tfidf,
# so the summary-based versions from the earlier cells are no longer available.
tfidf_vec = TfidfVectorizer()
text_tfidf = tfidf_vec.fit_transform(documents)

In [18]:
# Invert the title vocabulary (term -> column index) into column index -> term.
vocab = {column: term for term, column in tfidf_vec.vocabulary_.items()}

In [19]:
def return_weights(vocab, original_vocab, vector, vector_index, top_n):
    """Return the vocabulary entries of the ``top_n`` heaviest terms in one row.

    Parameters
    ----------
    vocab : mapping
        Column index -> term lookup.
    original_vocab : mapping
        Term -> value lookup applied to the selected terms (the notebook
        passes the vectorizer's ``vocabulary_``, so values are column indices).
    vector : indexable
        Object whose ``vector[vector_index]`` exposes parallel ``indices``
        and ``data`` sequences.
    vector_index : int
        Row to inspect.
    top_n : int
        Number of highest-weighted terms to keep.

    Returns
    -------
    list
        ``original_vocab`` values for the selected terms, heaviest first.
    """
    # Slice the row once and reuse it for both attributes.
    row = vector[vector_index]

    # Collect term -> weight for every column stored in the row.
    weight_by_term = {}
    for column, weight in zip(row.indices, row.data):
        weight_by_term[vocab[column]] = weight

    # Rank the terms by weight, heaviest first, and keep the leading top_n.
    ranked = pd.Series(weight_by_term).sort_values(ascending=False)
    top_terms = ranked[:top_n].index
    return [original_vocab[term] for term in top_terms]

# Print the vocabulary indices of the 3 highest-weighted words in row 8
print(return_weights(vocab, tfidf_vec.vocabulary_, text_tfidf, 8, 3))

[189, 942, 466]


In [20]:
def words_to_filter(vocab, original_vocab, vector, top_n):
    """Collect the top-weighted entries of every row into one de-duplicated set.

    For each row of ``vector`` (``vector.shape[0]`` rows), ``return_weights``
    supplies the ``original_vocab`` values of its ``top_n`` heaviest terms;
    the union over all rows is returned.

    Parameters
    ----------
    vocab : mapping
        Column index -> term lookup.
    original_vocab : mapping
        Term -> value lookup, forwarded to ``return_weights``.
    vector : matrix-like
        Must expose ``shape[0]`` and support the row access used by
        ``return_weights``.
    top_n : int
        Number of terms kept per row.

    Returns
    -------
    set
        Distinct values selected across all rows.
    """
    return {
        selected
        for row_number in range(vector.shape[0])
        for selected in return_weights(vocab, original_vocab, vector, row_number, top_n)
    }
    
# Call the function to get the set of word indices (top 3 weighted words of every row)
filtered_words = words_to_filter(vocab, tfidf_vec.vocabulary_, text_tfidf, 3)

# By converting filtered_words back into a list, we can use it to filter the columns in the text vector
filtered_text = text_tfidf[:, list(filtered_words)]

In [21]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier with default settings.
nb = GaussianNB()

# Target: the category_desc column (rows without it were dropped when the data was reloaded).
y = volunteer.category_desc

In [22]:
# Split the dataset according to the class distributions of category_desc, using the filtered_text vector.
# GaussianNB does not accept sparse input, hence the .toarray() conversion.
# random_state pins the shuffle so the accuracy printed below is reproducible across runs.
train_X, test_X, train_y, test_y = train_test_split(
    filtered_text.toarray(), y, stratify=y, random_state=42
)

# Fit the model to the training data
nb.fit(train_X, train_y)

# Print out the model's accuracy on the held-out split
print(nb.score(test_X, test_y))

0.5419354838709678


### Dimensionality reduction
- Unsupervised learning method
- Combines/decomposes a feature space
- Feature extraction - here we'll use it to reduce our feature space
- Principal component analysis
- Linear transformation to uncorrelated space
- Captures as much variance as possible in each component

### PCA in scikit-learn
```python
from sklearn.decomposition import PCA
pca = PCA()
df_pca = pca.fit_transform(df_)
```

In [28]:
from sklearn.decomposition import PCA

# Set up PCA and the X vector for dimensionality reduction.
# No n_components is given, so every component is kept.
# "Type" is excluded from the features (presumably it is the class label - confirm).
pca = PCA()
wine_X = wine.drop("Type", axis=1)

# Apply PCA to the wine dataset X vector
# NOTE(review): the features are not standardized before PCA; the ~99.8% share of the
# first component printed below may simply reflect the column with the largest scale -
# confirm whether scaling beforehand is intended.
transformed_X = pca.fit_transform(wine_X)

# Share of the total variance captured by each principal component.
print(pca.explained_variance_ratio_)

[9.98098798e-01 1.73593305e-03 9.43282757e-05 4.89438533e-05
 1.04695097e-05 5.60981698e-06 2.79968212e-06 1.44536313e-06
 9.75418873e-07 3.94184513e-07 2.13661389e-07 8.91974959e-08]


In [29]:
# Shell escape: runs `gitbsh` from the parent directory with stdout and stderr discarded.
# NOTE(review): what this command does is not visible from the notebook - it looks like a
# leftover housekeeping step; confirm it is needed before sharing.
!../gitbsh > /dev/null 2>&1