# Optional learning notebook about chi-squared

In [1]:
import string
import re
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

from nltk.tokenize import WordPunctTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Fixed random seed: passed as random_state to train_test_split below so the split is reproducible.
seed = 42

## 1. Manual chi-squared for the Twitter data
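Let's write functions that apply the manual $\chi^2$ calculation to the Twitter data. Recall that $\chi^2 = \sum{\frac{(O_{x_1x_2} - E_{x_1x_2})^2}{E_{x_1x_2}}}$. The functions below treat every (feature, class) pair as a 2x2 contingency table and use the equivalent closed form of this sum, which is derived in section 2.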

In [2]:
def chi_squared(counts):
    """
    Non vectorized version of chi squared function - the idea is that you see the relation with the formula above,
    but you should never use such an inefficient version when actually performing a chi-squared analysis
    (see chi_squared_vect for the vectorized equivalent).

    For every cell (i, j) a 2x2 contingency table is built from the input:
        c_tc - value of feature i in class j
        c_tx - value of feature i in all other classes
        c_xc - value of all other features in class j
        c_xx - everything else
    and the closed form n*(c_tc*c_xx - c_tx*c_xc)^2 / (product of the four marginals) is evaluated.

    Parameters
    ----------
    counts : 2-D array-like (np.ndarray, np.matrix or nested lists), shape (n_features, n_classes)
        Non-negative per-feature, per-class totals.

    Returns
    -------
    np.ndarray of float, same shape as `counts`
        The chi-squared statistic of each (feature, class) pair. A cell whose 2x2 table has a zero
        marginal is divided by zero and therefore comes out as nan/inf (NumPy emits a RuntimeWarning).
    """
    # Normalise to a float ndarray so that both np.matrix and plain arrays are indexed the same way.
    counts = np.asarray(counts, dtype=float)
    print("Applying chi-squared to {} feature and {} classes".format(counts.shape[0], counts.shape[1]))

    # The grand total and the marginal sums do not depend on (i, j): compute them once, not per cell.
    n = counts.sum()
    row_totals = counts.sum(axis=1)
    col_totals = counts.sum(axis=0)

    chi_values = np.zeros(counts.shape)
    for i in range(counts.shape[0]):
        for j in range(counts.shape[1]):
            c_tc = counts[i, j]
            c_tx = row_totals[i] - c_tc
            c_xc = col_totals[j] - c_tc
            c_xx = n - c_tc - c_tx - c_xc
            chi_values[i, j] = n*(((c_tc*c_xx)-(c_tx*c_xc))**2)/((c_tc+c_xc)*(c_tx+c_xx)*(c_tc+c_tx)*(c_xc+c_xx))
    return chi_values

def chi_squared_vect(counts):
    """
    Vectorized version of chi squared function - this is still a non-optimized version, but it should run faster than
    the loop-based chi_squared function.

    Every (feature, class) cell is treated as its own 2x2 contingency table:
        c_tc - value of the feature in the class
        c_tx - value of the feature in all other classes (row total minus the cell)
        c_xc - value of all other features in the class (column total minus the cell)
        c_xx - everything else
    and chi2 = n*(c_tc*c_xx - c_tx*c_xc)^2 / ((c_tc+c_xc)*(c_tx+c_xx)*(c_tc+c_tx)*(c_xc+c_xx)).

    Parameters
    ----------
    counts : 2-D array-like (np.ndarray, np.matrix or nested lists), shape (n_features, n_classes)
        Non-negative per-feature, per-class totals.

    Returns
    -------
    np.ndarray of float, same shape as `counts`
        The chi-squared statistic of each (feature, class) pair. The result is always a plain ndarray,
        also when an np.matrix is passed in. Cells with a zero marginal are divided by zero and come
        out as nan/inf (NumPy emits a RuntimeWarning).
    """
    # Work on a float ndarray: np.matrix keeps reductions 2-D but a plain ndarray does not, so the
    # marginal sums below would otherwise broadcast against the wrong axis (or fail) for ndarray input.
    counts = np.asarray(counts, dtype=float)
    print("Applying chi-squared to {} feature and {} classes".format(counts.shape[0], counts.shape[1]))

    n = counts.sum()
    # keepdims=True gives shapes (n_features, 1) and (1, n_classes), which broadcast per row / per column.
    row_totals = counts.sum(axis=1, keepdims=True)
    col_totals = counts.sum(axis=0, keepdims=True)

    c_tc = counts
    c_tx = row_totals - counts
    c_xc = col_totals - counts
    c_xx = n - c_tc - c_tx - c_xc

    num = n * np.square(c_tc * c_xx - c_tx * c_xc)
    den = (c_tc + c_xc) * (c_tx + c_xx) * (c_tc + c_tx) * (c_xc + c_xx)
    chi_values = num / den
    return chi_values

Now we'll apply our functions to the data and select the most important features. The features with the highest $\chi^2$ values are the most important ones, i.e., the ones that depend most strongly on the label.

In [3]:
# Load the tweets; the cleaning steps below rewrite the 'Tweet' column in place.
stat_df = pd.read_csv('./data/twitter_rep_dem_data_small.csv')


def hashtag_removal(doc):
    """Lower-case `doc` and delete every '#' that is followed by word characters."""
    return re.sub(r'#\w+', '', doc.lower())


def handle_removal(doc):
    """Lower-case `doc` and delete every '@' that is followed by word characters."""
    return re.sub(r'@\w+', '', doc.lower())


def simple_tokenizer(doc):
    """Tokenize `doc` with WordPunctTokenizer and join the tokens back with single spaces."""
    return " ".join(WordPunctTokenizer().tokenize(doc))


# Same order as before: strip handles, then hashtags, then normalise token spacing.
stat_df['Tweet'] = (
    stat_df['Tweet']
    .map(handle_removal)
    .map(hashtag_removal)
    .map(simple_tokenizer)
)

# 70/30 split; random_state=seed keeps the split reproducible.
train_data, test_data = train_test_split(stat_df, test_size=0.3, random_state=seed)

In [4]:
# TF-IDF weights over unigrams and bigrams, learned on the training tweets only.
small_vectorizer = TfidfVectorizer(ngram_range=(1,2))
small_vectorizer.fit(train_data.Tweet)
small_X_train = small_vectorizer.transform(train_data.Tweet)
small_y_train = train_data.Party

# Positional row indices of each party's tweets (np.where returns a 1-tuple of index arrays).
idx_rep = np.where(small_y_train=='Republican')
idx_dem = np.where(small_y_train=='Democrat')
rep_rows = idx_rep[0]
dem_rows = idx_dem[0]

# Per-feature totals of the TF-IDF weights within each party: one (1, n_features) row per party.
counts_rep = small_X_train[rep_rows, :].sum(axis=0)
counts_dem = small_X_train[dem_rows, :].sum(axis=0)
counts = np.concatenate((counts_rep, counts_dem))

# chi_squared_vect expects features on the rows and classes on the columns, hence the transpose.
chi_values_vect = chi_squared_vect(counts.transpose())

feature_names = small_vectorizer.get_feature_names_out()

# Ascending argsort per class column: the last rows hold the indices of the largest chi-squared values.
best_features = chi_values_vect.argsort(axis=0).tolist()

print("Most important features and their chi-squared:\n")
# sorted() orders the ten selected entries by feature index, not by chi-squared value.
top_entries = sorted(best_features[-10:])
for entry in top_entries:
    feature_idx = entry[0]
    print(u"{}: {}".format(feature_names[feature_idx], chi_values_vect[feature_idx, 0]))

Applying chi-squared to 131301 feature and 2 classes
Most important features and their chi-squared:

chairman: 25.628925195452286
code: 12.98547983333508
gun: 11.882957002831544
hearing: 14.628669701803808
pruitt: 9.71117632814275
reform: 14.129548514719188
tax: 22.00286461780654
tax code: 11.748658006980634
tax reform: 14.235518157444455
texas: 14.780349285278556


Nice, we got the same results as sklearn!

## 2. Generalization for a 2x2 contingency table

In some places you will see that the chi-squared formula for a 2x2 contingency table can be written in the form

$$ \frac{N(AD-BC)^2}{(A+B)(A+C)(B+D)(C+D)} $$

if we represent the values in the table like this:

|                      | C_1     |     C_2      |    Total    |
|----------------------|---------|--------------|-------------|
|         F_1          |   A     |      B       |  A+B        |
|         F_2          |   C     |      D       |  C+D        |
|         total        |   A+C   |      B+D     |  N=A+B+C+D  |

First, remember that the expected value for a cell, let's say $E_A$, is computed from $P(F_1)$ and $P(C_1)$ under the assumption that the feature and the class are independent, like this:

$$ E_A = N\ P(F_1)P(C_1) = \quad N\ \frac{A+B}{N}\frac{A+C}{N} =\quad \frac{(A+B)(A+C)}{N}$$

So taking the difference $O_A - E_A$ simplifies to:

$$ O_A - E_A = \\ A - \frac{(A+B)(A+C)}{N} = \\ A - \frac{A^2+AB + AC + BC \ (+ AD - AD)}{N} = \\
A - \frac{A\ (A + B + C + D) + (BC - AD)}{N} = \\ A - \frac{A\ N + (BC - AD)}{N} = \\ A - A - \frac{(BC - AD)}{N} = \\
\frac{(AD - BC)}{N}$$

If you write down the same expressions for cells B, C and D, you will get:

$$ E_A = \frac{(A+B)(A+C)}{N} \\ E_B = \frac{(B+D)(A+B)}{N} \\ E_C = \frac{(A+C)(C+D)}{N}\\ E_D = \frac{(B+D)(C+D)}{N}$$


$$ (O_A - E_A) = (O_D - E_D) = \frac{(AD - BC)}{N} \\ (O_B - E_B) = (O_C - E_C) = \frac{(BC - AD)}{N} $$

And putting it all together we get

$$ \chi^2 = \frac{(O_A - E_A)^2}{E_A} + \frac{(O_B - E_B)^2}{E_B} + \frac{(O_C - E_C)^2}{E_C} + \frac{(O_D - E_D)^2}{E_D} $$

$$ \chi^2 = \frac{(\frac{(AD - BC)}{N})^2}{\frac{(A+B)(A+C)}{N}} + \frac{(\frac{(BC - AD)}{N})^2}{\frac{(B+D)(A+B)}{N}} + \frac{(\frac{(BC - AD)}{N})^2}{\frac{(A+C)(C+D)}{N}} + \frac{(\frac{(AD - BC)}{N})^2}{\frac{(B+D)(C+D)}{N}} $$

$$ \chi^2 = \frac{(B+D)(C+D)(\frac{(AD - BC)}{N})^2}{\frac{(A+B)(A+C)(B+D)(C+D)}{N}} + \frac{(A+C)(C+D)(\frac{(BC - AD)}{N})^2}{\frac{(A+B)(A+C)(B+D)(C+D)}{N}} + \frac{(A+B)(B+D)(\frac{(BC - AD)}{N})^2}{\frac{(A+B)(A+C)(B+D)(C+D)}{N}} + \frac{(A+B)(A+C)(\frac{(AD - BC)}{N})^2}{\frac{(A+B)(A+C)(B+D)(C+D)}{N}} $$

Notice that

$$(BC - AD)^2 = ((-1)(AD - BC))^2 = (AD - BC)^2$$

so the expression simplifies to

$$ \frac{N [(B+D)(C+D) + (A+C)(C+D) + (A+B)(B+D) + (A+B)(A+C)](\frac{(AD - BC)}{N})^2}{(A+B)(A+C)(B+D)(C+D)} $$

The denominator is already there, let's simplify the numerator.

$$ \frac{N [BC + BD + CD + D^2 + AC + AD + C^2 + CD + AB + AD + B^2 + BD + A^2 + AC + AB + BC](\frac{(AD - BC)}{N})^2}{(A+B)(A+C)(B+D)(C+D)} $$

$$ \frac{N [A ( A + B + C + D ) + B (A + B + C + D) + C (A + B + C + D) + D (A + B + C + D)](\frac{(AD - BC)}{N})^2}{(A+B)(A+C)(B+D)(C+D)} $$

$$ \frac{N [( A + B + C + D ) ( A + B + C + D )](\frac{(AD - BC)}{N})^2}{(A+B)(A+C)(B+D)(C+D)} $$

$$ \frac{N (N^2)(\frac{(AD - BC)}{N})^2}{(A+B)(A+C)(B+D)(C+D)} $$

$$ \frac{N (N^2)(\frac{(AD - BC)^2}{N^2})}{(A+B)(A+C)(B+D)(C+D)} $$

$$ \frac{N (AD - BC)^2}{(A+B)(A+C)(B+D)(C+D)} $$