In [19]:
import pandas as pd
import numpy as np

# plots
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import os

# Build the paths with os.path so the cell works on Windows, Linux and macOS
# (the previous hard-coded '\\' separators only resolve on Windows).
# The empty last component keeps the trailing separator on DATA_FOLDER, so code
# that concatenates a file name onto it keeps working.
DATA_FOLDER = os.path.join('.', 'data', '')
DATASET = os.path.join(DATA_FOLDER, "dataset.csv")

# example read (the file is read as tab-separated; compression is inferred from the name)
df = pd.read_csv(DATASET, compression='infer', sep='\t')

<span style="color:red"> 
        Colored comment
</span>

##### Spark

In [None]:
# Assumes a Spark session (`spark`) and a Spark context (`sc`) have already been created;
# neither is defined in this cell.

# Read the JSON file "dataset.json.gz" and create a DataFrame.
# NOTE(review): this rebinds `df`, which an earlier cell assigned from pd.read_csv.
df = spark.read.json("dataset.json.gz")

# Convert DataFrame to RDD (via the DataFrame's `.rdd` attribute)
rdd = df.rdd

**Transformations**

In [None]:
# NOTE(review): the numeric examples below (x ** 2, x % 2, range(x, x + 3)) assume
# that every element of `rdd` is a plain integer -- confirm that this holds for the
# `rdd` built from the JSON DataFrame in the previous cell.

# Squaring each element in the RDD
squared_rdd = rdd.map(lambda x: x ** 2)

# Filtering even numbers from the RDD
even_rdd = rdd.filter(lambda x: x % 2 == 0)

# Flattening each element into a range of three numbers (x, x + 1, x + 2);
# despite its name, `words_rdd` therefore holds numbers, not words
words_rdd = rdd.flatMap(lambda x: list(range(x, x + 3)))

# Sampling the RDD without replacement, keeping 50% of the data
sampled_rdd = rdd.sample(withReplacement=False, fraction=0.5)

# NOTE(review): `rdd1` and `rdd2` are not defined in the visible cells; they must be
# created before the next two statements can run.

# Union of two RDDs
union_rdd = rdd1.union(rdd2)

# Intersection of two RDDs
intersect_rdd = rdd1.intersection(rdd2)

# Distinct elements in the RDD
distinct_rdd = rdd.distinct()

### PAIRS ###

# Creating a Pair RDD of (key, value) tuples and grouping values by key
pair_rdd = sc.parallelize([(1, 'a'), (2, 'b'), (1, 'c')])
grouped_rdd = pair_rdd.groupByKey()

# Reducing by key: values that share a key are combined with `+`
# (the values here are strings, so `+` concatenates them rather than summing)
sum_rdd = pair_rdd.reduceByKey(lambda x, y: x + y)

# Sorting the Pair RDD by key
sorted_rdd = pair_rdd.sortByKey()

# Creating another Pair RDD and performing an inner join on the keys
# (key 3 appears only in rdd3, keys 1 and 2 appear in both)
rdd3 = sc.parallelize([(1, 'apple'), (2, 'banana'), (3, 'orange')])
joined_rdd = pair_rdd.join(rdd3)

**Actions**

In [None]:
# These statements reuse `rdd`, `squared_rdd` and `pair_rdd` from the cells above.

# Collect all elements of the squared_rdd
collected_data = squared_rdd.collect()

# Count the total number of elements
count = rdd.count()

# Take the first three elements from the rdd
first_three = rdd.take(3)

# Save the elements of squared_rdd as text under the path "output/squared_numbers"
squared_rdd.saveAsTextFile("output/squared_numbers")

# Count the occurrences of each key in the pair_rdd
key_counts = pair_rdd.countByKey()

# Compute statistics about the elements in the rdd, such as mean, variance, etc.
# NOTE(review): presumably requires numeric elements -- confirm for the `rdd` in use.
rdd_stats = rdd.stats()

##### BeautifulSoup

In [6]:
import requests
from bs4 import BeautifulSoup

# example URL
URL = 'http://dblp.uni-trier.de/pers/hd/v/Vetterli:Martin'

# get the parsed html from URL
# A timeout prevents the cell from hanging forever on an unresponsive server, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
r = requests.get(URL, timeout=30)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'html.parser')

# get the parsed elements with find_all
all_links = soup.find_all('a')
print('Total number of links: {0}'.format(len(all_links)))

# only the <li> elements that carry the CSS class "entry"
all_items = soup.find_all('li', class_='entry')
print('Total number of items: {0}'.format(len(all_items)))

Total number of links: 15791
Total number of items: 464


##### Spacy

In [3]:
import spacy

# Load spaCy's small English pipeline
nlp = spacy.load("en_core_web_sm")

# Sentence used throughout this demo
example = 'I am already far north of London, and as I walk in the streets of Petersburgh, I feel a cold northern breeze play upon my cheeks, which braces my nerves and fills me with delight.'

# Run the pipeline; iterating over the result yields one token at a time
tokens = nlp(example)

# 1) plain token texts
tokenized = [tok.text for tok in tokens]
print(tokenized)

# 2) tokens that are not flagged as stop words
stop_word_removed = [tok for tok in tokens if not tok.is_stop]
print(stop_word_removed)

# 3) lemmas of the tokens kept in step 2
lemmatized = [tok.lemma_ for tok in stop_word_removed]
print(lemmatized)

['I', 'am', 'already', 'far', 'north', 'of', 'London', ',', 'and', 'as', 'I', 'walk', 'in', 'the', 'streets', 'of', 'Petersburgh', ',', 'I', 'feel', 'a', 'cold', 'northern', 'breeze', 'play', 'upon', 'my', 'cheeks', ',', 'which', 'braces', 'my', 'nerves', 'and', 'fills', 'me', 'with', 'delight', '.']
[far, north, London, ,, walk, streets, Petersburgh, ,, feel, cold, northern, breeze, play, cheeks, ,, braces, nerves, fills, delight, .]
['far', 'north', 'London', ',', 'walk', 'street', 'Petersburgh', ',', 'feel', 'cold', 'northern', 'breeze', 'play', 'cheek', ',', 'brace', 'nerve', 'fill', 'delight', '.']


##### Geometric mean
- used when the data is heavy tailed

In [8]:
from typing import List, Union

def geometric_mean(col: Union[List, pd.Series]) -> float:
    """
    Return the geometric mean of the values in `col`.

    The result is computed in log space: the natural logarithm of every value
    is taken, the logs are averaged, and the average is exponentiated.

    Parameters:
    - col (Union[List, pd.Series]): A list or pandas Series of numerical values.

    Returns:
    - float: Geometric mean of the input data.
    """
    log_values = np.log(col)
    mean_of_logs = log_values.mean()
    return np.exp(mean_of_logs)

# Example usage:
data = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
print(geometric_mean(data))

9.057457376233529


##### Correlations
- Pearson
    - works best in linear correlations
- Spearman
    - robust to outliers
    - relies on the rank of datapoints (rather than on actual values)
    - works best with [monotonic](https://en.wikipedia.org/wiki/Monotonic_function) functions

In [12]:
from scipy import stats                     # statistical correlation - .spearmanr, .pearsonr   
                                            # comparison - .ttest_ind

from statsmodels.stats import diagnostic    # check data disctribution - .kstest_normal / .kstest_exponential

import statsmodels.formula.api as smf       # ordinary least square - smf.ols(formula, data).fit().summary()
                                            # logistic least square - smf.logit(formula, data).fit().summary()

##### Bootstrap CI

In [11]:
def bootstrap_confidence_interval(array, iterations = 1000, ci_level = 95, random_state = None):
    """
    Bootstrap a confidence interval for the mean of the data.

    Each iteration resamples the data with replacement (same size as the input)
    and records the mean of the resample; the interval bounds are percentiles of
    those bootstrap means.

    Parameters:
    - array: A non-empty, one-dimensional array-like of numerical values
    - iterations: The number of bootstrap samples to generate (at least 1)
    - ci_level: CI percentage, e.g. 95 for a 95% confidence interval
    - random_state: Optional seed (anything accepted by np.random.default_rng) that
      makes the result reproducible. When None (default), numpy's global random
      state is used, exactly as before this parameter existed.

    Returns:
    - A tuple (lower, upper) with the bounds of the ci_level% confidence interval

    Raises:
    - ValueError: If array is empty or iterations is smaller than 1
    """
    sample = np.asarray(array)
    if sample.size == 0:
        raise ValueError('array must contain at least one value')
    if iterations < 1:
        raise ValueError('iterations must be at least 1')

    # Seeded calls get their own generator; unseeded calls keep using the global state
    if random_state is None:
        draw = np.random.choice
    else:
        draw = np.random.default_rng(random_state).choice

    means = np.zeros(iterations)
    # Percentage of the distribution left outside the interval (split over both tails)
    alpha = 100 - ci_level

    for i in range(iterations):
        bootstrap_sample = draw(a=sample, size=len(sample), replace=True)
        means[i] = np.mean(bootstrap_sample)

    lower = np.percentile(means, alpha / 2)
    upper = np.percentile(means, 100 - alpha / 2)

    return lower, upper
# Example usage:
data = [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
print(bootstrap_confidence_interval(data))

(7.6, 14.8)


##### Cool groupby
```python
stats_by_year = movies.groupby(movies['year']).apply(lambda x: pd.Series({
        'average_worldwide_gross': x['worldwide_gross'].mean(),
        'std_dev_worldwide_gross': x['worldwide_gross'].std()
    }))
```
same as
```python
movies.groupby('year')['worldwide_gross'].agg(['mean','std']).rename({
        'mean':'average_worldwide_gross',
        'std':'std_dev_worldwide_gross'
    },axis=1)
```
NOTE: ['worldwide_gross'] -- NOT [['worldwide_gross']]


##### Linear regression analysis - __04-Regression analysis__

**Interaction term, e.g.**\
Suppose you are modeling the relationship between income (X1), education (X2), and job satisfaction (Y). The interaction term X1×X2 captures whether the effect of income on job satisfaction depends on the level of education.\
If the interaction term is significant, it suggests that the effect of income (X1) on job satisfaction (Y) depends on the level of education (X2).

**R squared**\
R-squared measures the proportion of the variance in the dependent variable (Y) that is explained by the independent variables. In other words, R-squared shows how well the data fit the regression model (the goodness of fit). R-squared takes values between 0 and 1.\
For example, an R-squared of 60% means that 60% of the variability observed in the target variable is explained by the regression model. Generally, a higher R-squared indicates that more of the variability is explained by the model.

##### Propensity score matching

In [13]:
import networkx as nx

def propensity_score_matching(df: pd.DataFrame,
                 treat_column: str,
                 continuous_features: Union[List[str], None] = None,
                 categorical_features: Union[List[str], None] = None) -> List[int]:
    """
    Balances a dataset based on propensity scores for treatment and control groups.

    A logistic regression of the treatment indicator on the given features yields a
    propensity score per row. Every control row is then connected to every treatment
    row in a weighted graph (weight = 1 - |score difference|) and a maximum-weight
    matching selects the pairs.

    Parameters:
    - df (pd.DataFrame): The input DataFrame containing the original data. It is not modified.
    - treat_column (str): The column indicating treatment assignment (1 for treatment, 0 for control).
    - continuous_features (List[str]): List of names of continuous features to be standardized.
    - categorical_features (List[str]): List of names of categorical features for logistic regression.

    Returns:
    - List[int]: List of index labels of matched instances in the original DataFrame
      (first all first members of the matched pairs, then all second members).

    Raises:
    - AssertionError: If neither continuous nor categorical features are passed.
    """
    # None instead of [] as default avoids sharing one mutable list between calls
    continuous_features = list(continuous_features or [])
    categorical_features = list(categorical_features or [])
    assert continuous_features or categorical_features, 'no feature passed to be matched on'

    # Work on an explicit copy so the assignments below never touch (or warn about) `df`
    data = df[[treat_column] + continuous_features + categorical_features].copy()

    # Standardize the continuous features
    for column in continuous_features:
        data[column] = (data[column] - data[column].mean())/data[column].std()

    # Get the formula
    # NOTE(review): feature names are inserted as-is; a numerically coded categorical
    # column is presumably treated as continuous by the formula parser -- confirm
    # whether such columns should be wrapped in C(...).
    formula = ' + '.join(continuous_features + categorical_features)
    formula = f"{treat_column} ~ {formula}"

    # Estimate propensity scores
    mod = smf.logit(formula=formula, data=data)
    res = mod.fit()
    data['Propensity_score'] = res.predict()

    # Calculate similarity
    def get_similarity(propensity_score1, propensity_score2):
        '''Calculate similarity for instances with given propensity scores'''
        return 1-np.abs(propensity_score1-propensity_score2)

    # Split into control/treat groups; only the scores are needed for the matching
    scores = data['Propensity_score']
    treatment_scores = scores[data[treat_column] == 1]
    control_scores = scores[data[treat_column] == 0]

    # Balance the dataset: one edge per (control, treatment) pair.
    # Iterating over the score Series avoids rebuilding a row object for every pair,
    # which nested iterrows() did.
    G = nx.Graph()
    G.add_weighted_edges_from(
        (control_id, treatment_id, get_similarity(control_score, treatment_score))
        for control_id, control_score in control_scores.items()
        for treatment_id, treatment_score in treatment_scores.items()
    )

    pairs = list(nx.max_weight_matching(G))
    matched = [pair[0] for pair in pairs] + [pair[1] for pair in pairs]

    return matched

##### Machine Learning
**Confusion Matrix**
```python
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cm = confusion_matrix(y_test,y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

```


**Cross validation best score**

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge, LogisticRegression

## CV regression
# Ridge regression whose `alpha` is chosen among three candidate values
reg = Ridge()
param_grid = {'alpha':[1e-1, 1e-2, 1e-3]}
cv_model = GridSearchCV(reg, param_grid=param_grid, cv=3)

## CV classification
# Alternative setup kept for reference: swap it in for the regression block above.
# Its grid is keyed by 'C', so the final print (which reads 'alpha') must be adapted too.
# clf = LogisticRegression(random_state=42)
# param_grid = {'C':[1,10,100]}
# cv_model = GridSearchCV(clf, param_grid=param_grid, cv=3)

# NOTE(review): X_train / y_train are not defined in this cell -- presumably they
# come from an earlier train/test split; confirm before running.
cv_model = cv_model.fit(X_train, y_train)
print(cv_model.best_score_)
print("The optimal alpha is: %r" % (cv_model.best_params_['alpha']))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def train_model(X,y,C):
    """
    Fit a logistic regression on a 60/40 train/test split and print its test accuracy.

    Parameters:
    - X: Feature matrix.
    - y: Target labels.
    - C: Value passed to LogisticRegression as its `C` argument.

    Returns:
    - None. The accuracy on the held-out 40%, rounded to two decimals, is printed.
    """
    # Fixed random_state, so repeated calls use the same split
    split = train_test_split(X, y, test_size=0.4, random_state=42)
    X_fit, X_holdout, y_fit, y_holdout = split

    classifier = LogisticRegression(max_iter=2000, C=C)
    classifier.fit(X_fit, y_fit)

    holdout_accuracy = accuracy_score(y_holdout, classifier.predict(X_holdout))
    print('Test Accuracy:',round(holdout_accuracy,2))

In [15]:
from sklearn import metrics

def evaluate(y_true, y_predicted):
    """
    Summarise the quality of a set of predicted labels.

    Parameters:
    - y_true: True labels.
    - y_predicted: Predicted labels.

    Returns:
    - Dictionary with the four confusion-matrix counts ('True Positive',
      'False Positive', 'True Negative', 'False Negative') followed by
      'Precision', 'Recall', 'Accuracy' and 'F1 Score'.
    """
    # The flattened confusion matrix is unpacked into exactly four counts,
    # so a 2x2 matrix is expected here.
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_predicted).ravel()

    report = {
        'True Positive': tp,
        'False Positive': fp,
        'True Negative': tn,
        'False Negative': fn,
    }

    # Score functions, applied in the order in which they appear in the report
    scorers = (
        ('Precision', metrics.precision_score),
        ('Recall', metrics.recall_score),
        ('Accuracy', metrics.accuracy_score),
        ('F1 Score', metrics.f1_score),
    )
    for label, scorer in scorers:
        report[label] = scorer(y_true, y_predicted)

    return report

**Clustering imports**
```python
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN  # see lab 8 for DBSCAN
```

**PCA**
```python
X_reduced_pca = PCA(n_components=2).fit_transform(X)
```

**TSNE**
```python
X_reduced_tsne = TSNE(n_components=2, init='random', learning_rate='auto', random_state=0).fit_transform(X)
```

**Normalize**
```python
scaler = StandardScaler()
X_norm = scaler.fit_transform(X)
pd.DataFrame(X_norm)
```

**Example of Clustering Pipeline**
```python
# rescale
X_wine = StandardScaler().fit_transform(wine)

# best k
silhouette = []
for k in range(2,11):
    labels = KMeans(n_clusters=k,random_state=42).fit_predict(X_wine)
    score = silhouette_score(X_wine,labels)
    silhouette.append([k,score])

argmax = np.array(silhouette)[:,1].argmax()
k_best = silhouette[argmax][0]

# pca
pca_wine = PCA(n_components=2).fit_transform(X_wine)

# kmeans
labels_wine = KMeans(n_clusters=k_best,random_state=42).fit_predict(X_wine)

# plot
plt.scatter(pca_wine[:,0],pca_wine[:,1],c=labels_wine)
plt.title('PCA')

plt.show()
```

##### n-grams with frequency

In [44]:
from nltk import ngrams
from collections import Counter

def generate_ngrams_with_frequency(words, n, n_common: int = None):
    """
    Count how often each n-gram occurs in a list of words.

    Parameters:
    - words (list): List of words.
    - n (int): Size of the n-grams. For n <= 1 the words themselves are counted
      (as plain items, not as 1-tuples).
    - n_common: If given, only the n_common most frequent n-grams are returned,
      most frequent first; otherwise all n-grams are returned in order of first
      appearance.

    Returns:
    - List of (n-gram, frequency) tuples.
    """
    items = list(ngrams(words, n)) if n > 1 else words
    frequencies = Counter(items)

    if n_common is None:
        return list(frequencies.items())
    return frequencies.most_common(n_common)

# Example usage:
word_list = ["apple", "orange", "banana", "grape", "kiwi", "apple", "orange", "kiwi", "apple", "banana", "grape", "kiwi", "orange"]
n_value = 2
result = generate_ngrams_with_frequency(word_list, n_value, 5)
print(result)

[(('apple', 'orange'), 2), (('banana', 'grape'), 2), (('grape', 'kiwi'), 2), (('kiwi', 'apple'), 2), (('orange', 'banana'), 1)]


##### Clean text

In [24]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')

def clean_text(text_data: str, remove_stopwords: bool = True, lemmatize: bool = True) -> list:
    """
    Clean and preprocess text data.

    Steps: lowercase, replace every character outside a-z by a space, tokenize,
    optionally drop English stopwords, optionally lemmatize, drop single letters.

    Parameters:
    - text_data (str): Input text data.
    - remove_stopwords (bool): Flag to indicate whether to remove stopwords or not.
    - lemmatize (bool): Flag to indicate whether to lemmatize words or not.

    Returns:
    - text_data_clean (list): List of cleaned and preprocessed words.
    """
    # Convert to lowercase
    text_data = text_data.lower()

    # Remove all non-English-letter characters
    text_data = re.sub(r'[^a-z]', ' ', text_data)

    # Create a list of words
    tokens = nltk.word_tokenize(text_data)

    # Remove stopwords if remove_stopwords is True.
    # The stopword set is built once (not once per token), and the filter runs on the
    # surface forms BEFORE lemmatization, so a stopword cannot slip through because
    # the lemmatizer changed its form.
    if remove_stopwords:
        stop_words = frozenset(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize the words if lemmatize is True
    if lemmatize:
        wl = WordNetLemmatizer()
        tokens = [wl.lemmatize(word) for word in tokens]
        # A lemma can itself be a stopword, so filter once more after lemmatizing
        if remove_stopwords:
            tokens = [word for word in tokens if word not in stop_words]

    # Remove single letters
    text_data_clean = [word for word in tokens if len(word) > 1]

    return text_data_clean

# Example usage:
text_data = "This is an example sentence. It contains some words that we want to clean and preprocess for further analysis. Cleaning includes converting to lowercase, removing non-English-letter characters, tokenization, lemmatization, and removing stopwords."
cleaned_words = clean_text(text_data, remove_stopwords=True, lemmatize=False)
print(cleaned_words)

['example', 'sentence', 'contains', 'words', 'want', 'clean', 'preprocess', 'analysis', 'cleaning', 'includes', 'converting', 'lowercase', 'removing', 'non', 'english', 'letter', 'characters', 'tokenization', 'lemmatization', 'removing', 'stopwords']


##### Sentiment analysis
[link](https://medium.com/@rslavanyageetha/vader-a-comprehensive-guide-to-sentiment-analysis-in-python-c4f1868b0d2e#:~:text=text%20%3D%20%22I%20love%20Python!%22%0A%0Ascores%20%3D%20analyzer.polarity_scores(text)%0A%0Aprint(scores))

##### BOW and TFIDF

In [52]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
def vectorize(doc: np.array, vectorizer_type: str = 'bow'):
    """
    Turn text documents into a dense document-term matrix.

    Parameters:
    - doc (np.array): Array of text documents.
    - vectorizer_type (str): 'bow' for a CountVectorizer (Bag-of-Words) or
      'tfidf' for a TfidfVectorizer.

    Returns:
    - The fitted-and-transformed documents, converted with .toarray().

    Raises:
    - ValueError: If vectorizer_type is neither 'bow' nor 'tfidf'.
    """
    # Guard clause: reject unknown types before any vectorizer is created
    if vectorizer_type not in ('bow', 'tfidf'):
        raise ValueError("Invalid vectorizer type. Use 'bow' or 'tfidf'.")

    vectorizer = CountVectorizer() if vectorizer_type == 'bow' else TfidfVectorizer()
    document_term_matrix = vectorizer.fit_transform(doc)
    return document_term_matrix.toarray()

# Example usage:
documents = [
    "This is the first document.", 
    "This document is the second document.", 
    "And this is the third one."
    ]
vectorized_bow = vectorize(documents, vectorizer_type='bow')
vectorized_tfidf = vectorize(documents, vectorizer_type='tfidf')

print(vectorized_tfidf)

[[0.         0.46941728 0.61722732 0.3645444  0.         0.
  0.3645444  0.         0.3645444 ]
 [0.         0.7284449  0.         0.28285122 0.         0.47890875
  0.28285122 0.         0.28285122]
 [0.49711994 0.         0.         0.29360705 0.49711994 0.
  0.29360705 0.49711994 0.29360705]]


### Networkx

##### Sparsity
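The "sparsity" of a graph with $n$ nodes is measured here through its density (a sparse graph has a density close to 0), defined as follows:

$ L = \frac{|E|}{|E_{max}|}$, where $|E_{max}| = \frac{n (n-1)}{2}$ is the number of edges of a complete undirected graph with $n$ nodes

$ L = \text{number of edges} / \text{potential number of edges} $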
```python
nx.density(G)
```

##### Connected components
```python
list(nx.connected_components(G)) # list of sets of nodes (one set per component)
```

##### Diameter
"Diameter" = longest shortest-path

```python
nx.diameter(G)
```

##### Triadic Closure

__Clustering coefficient__ of a node
```python
nx.clustering(G, ['node'])
```
__Global Clustering coefficient__, or the ratio of all existing triangles over all possible triangles.
```python
nx.transitivity(G)
```

##### Centralities

- Degree centrality of node i = number of neighbours of node i
- Closeness centrality of node i = (n - 1) / (sum of the shortest-path distances from node i to the other nodes), i.e. the inverse of the average distance to the other nodes
- Betweenness centrality of node i = fraction of shortest paths in the network that pass through node i
- Katz centrality - generalization of degree centrality
- PageRank centrality - I am more important if more important nodes are connected to me.

In [None]:
import networkx as nx

# nx.degree(G) returns a DegreeView, which is not a dict. Passed directly to
# set_node_attributes it would be handled as one constant value and stored whole on
# every node, so convert it to a {node: degree} dict first.
degree = dict(nx.degree(G))
closeness = nx.closeness_centrality(G)
pagerank = nx.pagerank(G)
betweeness = nx.betweenness_centrality(G)
katz = nx.katz_centrality(G)

# Store each measure as a node attribute. The attribute names (including the spelling
# 'betweeness') are kept unchanged so that existing lookups keep working.
nx.set_node_attributes(G, degree, 'degree')
nx.set_node_attributes(G, closeness, 'closeness')
nx.set_node_attributes(G, pagerank, 'pagerank')
nx.set_node_attributes(G, betweeness, 'betweeness')
nx.set_node_attributes(G, katz, 'katz')

##### Community - Louvain

In [None]:
from community import community_louvain

# Compute a partition of G with the Louvain implementation from the `community` package
# (presumably a dict mapping node -> community id -- confirm against the package docs)
partition = community_louvain.best_partition(G)
# Store the result on the nodes under the attribute name 'louvain_community'
nx.set_node_attributes(G, partition, 'louvain_community')

##### Histogram

In [None]:
# histogram of df['column'] in linear and log
# np.unique returns the distinct values together with how often each one occurs
unique_val = np.unique(df['column'], return_counts=True)
deg_df = pd.DataFrame({'degree': unique_val[0], 'count': unique_val[1]}).sort_values(by='degree')

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].plot(deg_df['degree'], deg_df['count'], '.-', label='linear')
axes[1].loglog(deg_df['degree'], deg_df['count'], '.-', label='log')

# Label and add a legend to BOTH panels (plt.legend() only affects the last axes)
for ax in axes:
    ax.set_xlabel('degree')
    ax.set_ylabel('count')
    ax.legend()

fig.suptitle('In-degree distribution')
fig.tight_layout()
plt.show()
