LOAD DATASET, CREATE FOLDERS

In [1]:
import os
import shutil
import random
import re

from google.colab import drive
# Mount Google Drive so the dataset folder becomes visible to the runtime.
drive.mount('/content/drive')

folder = '/content/drive/My Drive/text_files'
original_folder = '/content/original_files'

# Start from a clean local copy of the dataset; a missing folder simply means
# this is the first run.
try:
    shutil.rmtree(original_folder)
except FileNotFoundError:
    pass
shutil.copytree(folder, original_folder)
os.chdir(original_folder)

# Keep regular files only: copytree also copies any sub-folders, and the
# processing loops further down open every listed entry as a text file.
files = [
    name
    for name in os.listdir(original_folder)
    if os.path.isfile(os.path.join(original_folder, name))
]

def natural_sort_key(s):
    """Build a sort key that orders embedded numbers by value, not by text.

    The string is split on runs of decimal digits; each digit run becomes an
    int and every other piece is lowercased, so 'file2' sorts before 'file10'
    and letter case is ignored.

    Args:
        s: The string (here, a file name) to build a key for.

    Returns:
        A list alternating lowercase text pieces and ints.
    """
    # isdecimal() matches exactly what the regex \d matches. isdigit() would
    # also accept characters such as '²', which int() cannot parse.
    return [int(text) if text.isdecimal() else text.lower() for text in re.split(r'(\d+)', s)]
# Order the documents naturally (file2 before file10) and count them.
files = sorted(files, key=natural_sort_key)
count = len(files)

# Map 1-based document ids to file names.
doc_ids = dict(enumerate(files, start=1))
print(doc_ids)

# Pick a few documents to display before/after each preprocessing stage.
# A dedicated seeded generator keeps the selection identical across re-runs
# without touching the global random state; the size is capped so a small
# dataset does not raise ValueError.
random_files = random.Random(42).sample(files, min(5, count))
random_files = sorted(random_files, key=natural_sort_key)
print(random_files)

preprocessed_folder = '/content/preprocessed_files'
tokenized_folder = '/content/tokenized_files'

# Recreate both output folders from scratch and pre-create one empty file per
# document, named after the real source file so later stages overwrite it.
for output_folder in (preprocessed_folder, tokenized_folder):
    try:
        shutil.rmtree(output_folder)
    except FileNotFoundError:
        pass
    os.makedirs(output_folder)
    for name in files:
        with open(os.path.join(output_folder, name), 'w'):
            pass

Mounted at /content/drive
{1: 'file1.txt', 2: 'file2.txt', 3: 'file3.txt', 4: 'file4.txt', 5: 'file5.txt', 6: 'file6.txt', 7: 'file7.txt', 8: 'file8.txt', 9: 'file9.txt', 10: 'file10.txt', 11: 'file11.txt', 12: 'file12.txt', 13: 'file13.txt', 14: 'file14.txt', 15: 'file15.txt', 16: 'file16.txt', 17: 'file17.txt', 18: 'file18.txt', 19: 'file19.txt', 20: 'file20.txt', 21: 'file21.txt', 22: 'file22.txt', 23: 'file23.txt', 24: 'file24.txt', 25: 'file25.txt', 26: 'file26.txt', 27: 'file27.txt', 28: 'file28.txt', 29: 'file29.txt', 30: 'file30.txt', 31: 'file31.txt', 32: 'file32.txt', 33: 'file33.txt', 34: 'file34.txt', 35: 'file35.txt', 36: 'file36.txt', 37: 'file37.txt', 38: 'file38.txt', 39: 'file39.txt', 40: 'file40.txt', 41: 'file41.txt', 42: 'file42.txt', 43: 'file43.txt', 44: 'file44.txt', 45: 'file45.txt', 46: 'file46.txt', 47: 'file47.txt', 48: 'file48.txt', 49: 'file49.txt', 50: 'file50.txt', 51: 'file51.txt', 52: 'file52.txt', 53: 'file53.txt', 54: 'file54.txt', 55: 'file55.txt', 5

LOWERCASE

In [2]:
import pandas as pd

# Lowercase stage: read each original document, store its lowercased text in
# the preprocessed folder, and echo the sampled documents before and after.
print("Before preprocessing:\n")

for name in files:
    with open(os.path.join(original_folder, name), 'r') as source:
        raw_text = source.read()

    # Only the sampled documents are shown.
    if name in random_files:
        print(name, " :")
        print(raw_text, end="\n\n")

    with open(os.path.join(preprocessed_folder, name), 'w') as target:
        target.write(raw_text.lower())

print("----------------------------------------------------------------")
print("After preprocessing:\n")

for name in random_files:
    with open(os.path.join(preprocessed_folder, name), 'r') as source:
        lowered_text = source.read()
    print(name, " :")
    print(lowered_text, end="\n\n")


Before preprocessing:

file396.txt  :
This pickup has a great, traditional, single coil strat sound. I'm really pleased with the sweet clear tone this delivers. You can make it scream with your amp setting if that's what you want. It's really great looking, too! I had no issues at all with the soldering connections, and the volume and tone pots work well as does the jack. I used this on a cigar box guitar build, and I love the tone! This homemade CBG sounds like a legitimate strat. I would definitely use this pickup again in the future, and I have no doubt it would sound good to swap out the pups on a cheapo commercial strat with these. My only complaint, and it's a small one, is that the knobs included had a tiny bit of smearing on the gold paint. It's not a big deal because many pups don't include knobs at all, and there are so many knob options available at very affordable prices. I actually used the knobs anyway, so I'm not going to knock a star off for that. The tone and looks of 

TOKENIZATION

In [3]:
import nltk
# word_tokenize needs the Punkt sentence models. Older NLTK releases load the
# 'punkt' resource while recent releases load 'punkt_tab', so request both to
# keep this cell working on a fresh kernel regardless of the installed version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)
from nltk.tokenize import word_tokenize

print("Before preprocessing:\n")

for name in files:
    preprocessed_path = os.path.join(preprocessed_folder, name)
    with open(preprocessed_path, 'r') as source:
        text = source.read()

    # Only the sampled documents are shown.
    if name in random_files:
        print(name, " :")
        print(text, end="\n\n")

    # The token list is stored as its Python list literal; the following
    # stages parse it back with ast.literal_eval.
    serialized_tokens = str(word_tokenize(text))

    # Replace the working copy and keep a snapshot of the raw tokens.
    for target_path in (preprocessed_path, os.path.join(tokenized_folder, name)):
        with open(target_path, 'w') as target:
            target.write(serialized_tokens)

print("----------------------------------------------------------------")
print("After preprocessing:\n")

for name in random_files:
    with open(os.path.join(preprocessed_folder, name), 'r') as source:
        print(name, " :")
        print(source.read(), end="\n\n")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Before preprocessing:

file396.txt  :
this pickup has a great, traditional, single coil strat sound. i'm really pleased with the sweet clear tone this delivers. you can make it scream with your amp setting if that's what you want. it's really great looking, too! i had no issues at all with the soldering connections, and the volume and tone pots work well as does the jack. i used this on a cigar box guitar build, and i love the tone! this homemade cbg sounds like a legitimate strat. i would definitely use this pickup again in the future, and i have no doubt it would sound good to swap out the pups on a cheapo commercial strat with these. my only complaint, and it's a small one, is that the knobs included had a tiny bit of smearing on the gold paint. it's not a big deal because many pups don't include knobs at all, and there are so many knob options available at very affordable prices. i actually used the knobs anyway, so i'm not going to knock a star off for that. the tone and looks of 

STOPWORDS

In [4]:
from nltk.corpus import stopwords
import ast
# Stopword stage: remove English stopwords from every stored token list and
# echo the sampled documents before and after.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

print("Before preprocessing:\n")

for name in files:
    token_path = os.path.join(preprocessed_folder, name)
    with open(token_path, 'r') as source:
        serialized = source.read()

    # Only the sampled documents are shown.
    if name in random_files:
        print(name, " :")
        print(serialized, end="\n\n")

    # The file holds a Python list literal of tokens.
    kept_tokens = []
    for token in ast.literal_eval(serialized):
        if token.lower() not in stop_words:
            kept_tokens.append(token)

    with open(token_path, 'w') as target:
        target.write(str(kept_tokens))

print("----------------------------------------------------------------")
print("After preprocessing:\n")

for name in random_files:
    with open(os.path.join(preprocessed_folder, name), 'r') as source:
        remaining = source.read()
    print(name, " :")
    print(remaining, end="\n\n")


Before preprocessing:



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


file396.txt  :
['this', 'pickup', 'has', 'a', 'great', ',', 'traditional', ',', 'single', 'coil', 'strat', 'sound', '.', 'i', "'m", 'really', 'pleased', 'with', 'the', 'sweet', 'clear', 'tone', 'this', 'delivers', '.', 'you', 'can', 'make', 'it', 'scream', 'with', 'your', 'amp', 'setting', 'if', 'that', "'s", 'what', 'you', 'want', '.', 'it', "'s", 'really', 'great', 'looking', ',', 'too', '!', 'i', 'had', 'no', 'issues', 'at', 'all', 'with', 'the', 'soldering', 'connections', ',', 'and', 'the', 'volume', 'and', 'tone', 'pots', 'work', 'well', 'as', 'does', 'the', 'jack', '.', 'i', 'used', 'this', 'on', 'a', 'cigar', 'box', 'guitar', 'build', ',', 'and', 'i', 'love', 'the', 'tone', '!', 'this', 'homemade', 'cbg', 'sounds', 'like', 'a', 'legitimate', 'strat', '.', 'i', 'would', 'definitely', 'use', 'this', 'pickup', 'again', 'in', 'the', 'future', ',', 'and', 'i', 'have', 'no', 'doubt', 'it', 'would', 'sound', 'good', 'to', 'swap', 'out', 'the', 'pups', 'on', 'a', 'cheapo', 'commercial'

PUNCTUATION

In [5]:
import string
# Punctuation stage: remove punctuation tokens from every stored token list
# and echo the sampled documents before and after.
punctuations = set(string.punctuation)

print("Before preprocessing:\n")

for name in files:
    token_path = os.path.join(preprocessed_folder, name)
    with open(token_path, 'r') as source:
        serialized = source.read()

    # Only the sampled documents are shown.
    if name in random_files:
        print(name, " :")
        print(serialized, end="\n\n")

    # The file holds a Python list literal of tokens.
    tokens = ast.literal_eval(serialized)

    # Drop every non-empty token made up solely of punctuation characters.
    # A plain membership test would only catch single characters and let
    # multi-character tokens such as '...', '``' or '--' through. Empty
    # tokens are left in place, as before.
    filtered = [
        token for token in tokens
        if not (token and all(char in punctuations for char in token))
    ]

    with open(token_path, 'w') as target:
        target.write(str(filtered))

print("----------------------------------------------------------------")
print("After preprocessing:\n")

for name in random_files:
    with open(os.path.join(preprocessed_folder, name), 'r') as source:
        print(name, " :")
        print(source.read(), end="\n\n")


Before preprocessing:

file396.txt  :
['pickup', 'great', ',', 'traditional', ',', 'single', 'coil', 'strat', 'sound', '.', "'m", 'really', 'pleased', 'sweet', 'clear', 'tone', 'delivers', '.', 'make', 'scream', 'amp', 'setting', "'s", 'want', '.', "'s", 'really', 'great', 'looking', ',', '!', 'issues', 'soldering', 'connections', ',', 'volume', 'tone', 'pots', 'work', 'well', 'jack', '.', 'used', 'cigar', 'box', 'guitar', 'build', ',', 'love', 'tone', '!', 'homemade', 'cbg', 'sounds', 'like', 'legitimate', 'strat', '.', 'would', 'definitely', 'use', 'pickup', 'future', ',', 'doubt', 'would', 'sound', 'good', 'swap', 'pups', 'cheapo', 'commercial', 'strat', '.', 'complaint', ',', "'s", 'small', 'one', ',', 'knobs', 'included', 'tiny', 'bit', 'smearing', 'gold', 'paint', '.', "'s", 'big', 'deal', 'many', 'pups', "n't", 'include', 'knobs', ',', 'many', 'knob', 'options', 'available', 'affordable', 'prices', '.', 'actually', 'used', 'knobs', 'anyway', ',', "'m", 'going', 'knock', 'star', 

BLANK SPACE

In [6]:
# Blank-token stage: discard tokens that are empty or whitespace-only and echo
# the sampled documents before and after.
print("Before preprocessing:\n")

for name in files:
    token_path = os.path.join(preprocessed_folder, name)
    with open(token_path, 'r') as source:
        serialized = source.read()

    # Only the sampled documents are shown.
    if name in random_files:
        print(name, " :")
        print(serialized, end="\n\n")

    # The file holds a Python list literal of tokens; keep those that still
    # have content once lowercased and stripped.
    non_blank = []
    for token in ast.literal_eval(serialized):
        if token.lower().strip():
            non_blank.append(token)

    with open(token_path, 'w') as target:
        target.write(str(non_blank))

print("----------------------------------------------------------------")
print("After preprocessing:\n")

for name in random_files:
    with open(os.path.join(preprocessed_folder, name), 'r') as source:
        cleaned = source.read()
    print(name, " :")
    print(cleaned, end="\n\n")


Before preprocessing:

file396.txt  :
['pickup', 'great', 'traditional', 'single', 'coil', 'strat', 'sound', "'m", 'really', 'pleased', 'sweet', 'clear', 'tone', 'delivers', 'make', 'scream', 'amp', 'setting', "'s", 'want', "'s", 'really', 'great', 'looking', 'issues', 'soldering', 'connections', 'volume', 'tone', 'pots', 'work', 'well', 'jack', 'used', 'cigar', 'box', 'guitar', 'build', 'love', 'tone', 'homemade', 'cbg', 'sounds', 'like', 'legitimate', 'strat', 'would', 'definitely', 'use', 'pickup', 'future', 'doubt', 'would', 'sound', 'good', 'swap', 'pups', 'cheapo', 'commercial', 'strat', 'complaint', "'s", 'small', 'one', 'knobs', 'included', 'tiny', 'bit', 'smearing', 'gold', 'paint', "'s", 'big', 'deal', 'many', 'pups', "n't", 'include', 'knobs', 'many', 'knob', 'options', 'available', 'affordable', 'prices', 'actually', 'used', 'knobs', 'anyway', "'m", 'going', 'knock', 'star', 'tone', 'looks', 'pickup', 'fantastic', 'price']

file464.txt  :
['happy', 'baritone', 'ukulele', 'w