In [43]:
import pandas as pd
import glob

# Define the pattern to match your CSV files
# Make sure to include *.csv to select only CSV files
pattern = r'C:\Users\valhk\Documents\MMA\Text analytics\Team project\cleaned posts\*.csv'
# Use glob to match all files following the pattern
file_paths = glob.glob(pattern)
output_file_path = r'C:\Users\valhk\Documents\MMA\Text analytics\Team project\cleaned posts\combined_blackfriday.csv'
# Initialize an empty list to store dataframes
dfs = []

# Iterate over the file paths and read each CSV file
for file_path in file_paths:
    try:
        df = pd.read_csv(file_path)
        dfs.append(df)
    except PermissionError as e:
        print(f"Permission denied for file: {file_path}. Error: {e}")

# Concatenate all dataframes into a single one, if any have been read
if dfs:
    combined_df = pd.concat(dfs, ignore_index=True)
else:
    print("No CSV files read. Please check the directory path and permissions.")
combined_df


Unnamed: 0,score,retrieved_on,subreddit_id,body
0,1,2018-09-03,amazonprime,mini sale day like black friday cyber monday g...
1,1,2018-09-05,amazonprime,pretty sure like black friday liquidation sale...
2,3,2018-09-18,amazonprime,prime times amazon offered one day discount to...
3,2,2018-09-23,amazonprime,black friday probably
4,2,2018-09-23,amazonprime,cyber monday prime day also amazon amazing dea...
...,...,...,...,...
49304,1,2022-12-14,walmart,blitz used call black friday sales learned ter...
49305,5,2022-12-14,walmart,dispel myth played cool said true wide open pi...
49306,5,2022-12-14,walmart,yup problem capitalism monetary incentive give...
49307,18,2022-12-14,walmart,whole pallet pw crockpots leftover black frida...


In [40]:
combined_df.to_csv(output_file_path, index=False)

In [15]:
import numpy as np

# Add year column
combined_df['year'] = combined_df['retrieved_on'].str[:4]

# Define your conditions and choices for the new column
conditions = [
    combined_df['year'] == '2018',
    combined_df['year'] == '2019',
    combined_df['year'] == '2020',
    combined_df['year'] == '2021',
    combined_df['year'] == '2022'
]

# Corresponding values for each condition
choices = [1, 2, 3, 4, 5]

# Use np.select to apply these conditions and choices
combined_df['var_year'] = np.select(conditions, choices, default=0)

# Now combined_df will have the 'var_year' column with values based on 'year'
combined_df = combined_df.drop(columns=['year', 'retrieved_on'])
original_df = combined_df
original_df.to_csv('original_full.csv', index=False)

In [16]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Ensure the necessary NLTK resources are downloaded
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Initialize the WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Function to convert nltk POS tags to WordNet POS tags
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

# Function to lemmatize a sentence
def lemmatize_sentence(sentence):
    # Tokenize the sentence and find the POS tag for each token
    nltk_tagged = pos_tag(word_tokenize(sentence))  
    # Tuple of (word, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            # if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            # else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

# Create a copy of the DataFrame for lemmatization
lemmatized_df = combined_df.copy()

# Apply the lemmatization to the 'body' column
lemmatized_df['body'] = lemmatized_df['body'].apply(lemmatize_sentence)

lemmatized_df.to_csv('lemmatized_full.csv', index=False)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\valhk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\valhk\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\valhk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

max_features = 5000

# Function to create a DataFrame with n-gram counts, using a sparse matrix
def create_ngram_df(text_series, original_columns_df, ngram_range):
    vectorizer = CountVectorizer(
        ngram_range=ngram_range,
        tokenizer=word_tokenize,
        token_pattern=r'(?u)\b[A-Za-z]+\b',
        max_features=max_features
    )
    X = vectorizer.fit_transform(text_series)
    ngram_df = pd.DataFrame.sparse.from_spmatrix(X, columns=vectorizer.get_feature_names_out())
    # Join the original columns with the n-gram DataFrame
    final_df = pd.concat([original_columns_df.reset_index(drop=True), ngram_df], axis=1)
    print(max_features)
    return final_df

# Tokenize using monogram, bigram, trigram, and (1,3) grams
ngram_ranges = [(1, 1), (2, 2), (3, 3), (1, 3)]



# Selecting the original columns to keep
original_columns = combined_df[['score', 'subreddit_id', 'var_year']]
lemmatized_columns = lemmatized_df[['score', 'subreddit_id', 'var_year']]

# Create dictionaries for original and lemmatized dataframes with appropriate keys and including original columns
original_ngram_dfs = {f"{n[0]}_{n[1]}_grams_count": create_ngram_df(combined_df['body'], original_columns, n) for n in ngram_ranges}
lemmatized_ngram_dfs = {f"{n[0]}_{n[1]}_grams_count": create_ngram_df(lemmatized_df['body'], lemmatized_columns, n) for n in ngram_ranges}



5000




5000




5000




5000




5000




5000




5000




5000


In [6]:
monograms_count_o = original_ngram_dfs['1_1_grams_count']
bigrams_count_o = original_ngram_dfs['2_2_grams_count']
trigrams_count_o = original_ngram_dfs['3_3_grams_count']
one_three_grams_count_o = original_ngram_dfs['1_3_grams_count']

monograms_count_l = lemmatized_ngram_dfs['1_1_grams_count']
bigrams_count_l = lemmatized_ngram_dfs['2_2_grams_count']
trigrams_count_l = lemmatized_ngram_dfs['3_3_grams_count']
one_three_grams_count_l = lemmatized_ngram_dfs['1_3_grams_count']

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from nltk.tokenize import word_tokenize
import re

# Custom tokenizer function
def alpha_tokenizer(text):
    tokens = word_tokenize(text)
    alpha_tokens = [token for token in tokens if re.match(r'^[A-Za-z]+$', token)]
    return alpha_tokens

# Modified function to create a DataFrame with n-gram TF-IDF features and original columns
def create_ngram_tfidf_df(text_series, original_columns_df, ngram_range, max_features=5000):
    vectorizer = TfidfVectorizer(
        ngram_range=ngram_range,
        tokenizer=alpha_tokenizer,  # Use the custom tokenizer
        max_features=max_features  # Limit the number of features
    )
    X = vectorizer.fit_transform(text_series)
    ngram_df = pd.DataFrame.sparse.from_spmatrix(X, columns=vectorizer.get_feature_names_out())
    # Join the original columns with the n-gram DataFrame
    final_df = pd.concat([original_columns_df.reset_index(drop=True), ngram_df], axis=1)
    return final_df

# Assuming `combined_df` and `lemmatized_df` already have the 'score', 'subreddit_id', and 'var_year' columns
original_columns = combined_df[['score', 'subreddit_id', 'var_year']]
lemmatized_columns = lemmatized_df[['score', 'subreddit_id', 'var_year']]

# Create dictionaries for original and lemmatized dataframes with appropriate keys and including original columns
original_ngram_dfs_tfidf = {
    f"{n[0]}_{n[1]}_grams_tfidf": create_ngram_tfidf_df(combined_df['body'], original_columns, n) for n in ngram_ranges
}
lemmatized_ngram_dfs_tfidf = {
    f"{n[0]}_{n[1]}_grams_tfidf": create_ngram_tfidf_df(lemmatized_df['body'], lemmatized_columns, n) for n in ngram_ranges
}




In [9]:
monograms_count_tfidf_o = original_ngram_dfs_tfidf['1_1_grams_tfidf']
bigrams_count_tfidf_o = original_ngram_dfs_tfidf['2_2_grams_tfidf']
trigrams_count_tfidf_o = original_ngram_dfs_tfidf['3_3_grams_tfidf']
one_three_grams_count_tfidf_o = original_ngram_dfs_tfidf['1_3_grams_tfidf']

monograms_count_tfidf_l = lemmatized_ngram_dfs_tfidf['1_1_grams_tfidf']
bigrams_count_tfidf_l = lemmatized_ngram_dfs_tfidf['2_2_grams_tfidf']
trigrams_count_tfidf_l = lemmatized_ngram_dfs_tfidf['3_3_grams_tfidf']
one_three_grams_count_tfidf_l = lemmatized_ngram_dfs_tfidf['1_3_grams_tfidf']

In [10]:
trigrams_count_tfidf_l

Unnamed: 0,score,subreddit_id,var_year,able add cart,able find anything,able get away,able get deal,able get one,able get price,able order one,...,year work retail,year work thanksgiving,year year ago,year year year,yes best buy,yes black friday,yesterday black friday,youtu http youtu,youtube com watch,zelda breath wild
0,1,amazonprime,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,amazonprime,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,amazonprime,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2,amazonprime,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2,amazonprime,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49304,1,walmart,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49305,5,walmart,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49306,5,walmart,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
49307,18,walmart,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# Saving monograms count DataFrames to CSV
monograms_count_o.to_csv('monograms_count_original.csv', index=False)

In [34]:

monograms_count_l.to_csv('monograms_count_lemmatized.csv', index=False)


In [None]:

# Saving bigrams count DataFrames to CSV
bigrams_count_o.to_csv('bigrams_count_original.csv', index=False)
bigrams_count_l.to_csv('bigrams_count_lemmatized.csv', index=False)

# Saving trigrams count DataFrames to CSV
trigrams_count_o.to_csv('trigrams_count_original.csv', index=False)
trigrams_count_l.to_csv('trigrams_count_lemmatized.csv', index=False)

# Saving one to three grams count DataFrames to CSV
one_three_grams_count_o.to_csv('one_three_grams_count_original.csv', index=False)


In [11]:

one_three_grams_count_l.to_csv('one_three_grams_count_lemmatized.csv', index=False)


In [12]:

# Saving TF-IDF DataFrames to CSV
monograms_count_tfidf_o.to_csv('monograms_count_tfidf_original.csv', index=False)
monograms_count_tfidf_l.to_csv('monograms_count_tfidf_lemmatized.csv', index=False)
bigrams_count_tfidf_o.to_csv('bigrams_count_tfidf_original.csv', index=False)
bigrams_count_tfidf_l.to_csv('bigrams_count_tfidf_lemmatized.csv', index=False)
trigrams_count_tfidf_o.to_csv('trigrams_count_tfidf_original.csv', index=False)
trigrams_count_tfidf_l.to_csv('trigrams_count_tfidf_lemmatized.csv', index=False)
one_three_grams_count_tfidf_o.to_csv('one_three_grams_count_tfidf_original.csv', index=False)
one_three_grams_count_tfidf_l.to_csv('one_three_grams_count_tfidf_lemmatized.csv', index=False)

In [None]:
print("done")