In [8]:
import re
from string import punctuation
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

######################## Text Exploration ########################

# Regular expression - a language to identify pattern/sequence of character

## grep functions (without using any package)
ww = ["statistics", "estate", "castrate", "catalyst", "Statistics"]
ss = ["I like statistics", "I like bananas", "Estates and statues are expensive"]

# 1st function - grep equivalent - give the location/index of pattern
def grep(pattern, strings, ignore_case=False, value=False):
    flags = re.IGNORECASE if ignore_case else 0
    matches = [(i, s) for i, s in enumerate(strings) if re.search(pattern, s, flags)]
    if value:
        return [s for i, s in matches]
    return [i for i, s in matches]

print(grep("stat", ww))  # returns indices
print(grep("stat", ww, ignore_case=True))  # ignore case, returns indices
print(grep("stat", ww, ignore_case=True, value=True))  # ignore case, returns matched strings

# 2nd function - grepl equivalent - give logical expression
def grepl(pattern, strings, ignore_case=False):
    flags = re.IGNORECASE if ignore_case else 0
    return [bool(re.search(pattern, s, flags)) for s in strings]

print(grepl("stat", ww))  # Return true/false
print(grepl("stat", ss))

# 3rd function - regexpr equivalent
def regexpr(pattern, strings):
    results = []
    for s in strings:
        match = re.search(pattern, s)
        if match:
            results.append((match.start(), match.end() - match.start()))
        else:
            results.append((-1, -1))
    return results

print(regexpr("stat", ww))
print(regexpr("stat", ss))

# 4th function - gregexpr equivalent
def gregexpr(pattern, strings):
    return [[(m.start(), m.end() - m.start()) for m in re.finditer(pattern, s)] for s in strings]

print(gregexpr("stat", ss))

# 5th function - regexec equivalent
def regexec(pattern, strings):
    results = []
    for s in strings:
        match = re.search(pattern, s)
        if match:
            groups = match.groups()
            results.append((match.start(), match.end() - match.start(), groups) if groups else (match.start(), match.end() - match.start()))
        else:
            results.append((-1, -1))
    return results

print(regexec("(st)(at)", ww))

# 6th function - sub equivalent
def sub(pattern, replacement, strings, ignore_case=False):
    flags = re.IGNORECASE if ignore_case else 0
    return [re.sub(pattern, replacement, s, flags=flags) for s in strings]

print(sub("stat", "STAT", ww, ignore_case=True))
print(sub("stat", "STAT", ss, ignore_case=True))

# 7th function - gsub equivalent (same as sub in Python)
def gsub(pattern, replacement, strings, ignore_case=False):
    return sub(pattern, replacement, strings, ignore_case)

print(gsub("stat", "STAT", ss, ignore_case=True))

# Common string operations
text = "This is STQD6114"
print(len(text))  # str_length equivalent

sentences = ["This is a sentence.", "Another sentence here."]
print([s.split() for s in sentences])  # str_split equivalent

print("".join(["a", "b", "c"]))  # str_c equivalent
print(["A" + s for s in ["li", "bu", "ngry"]])  # str_c with vector
print(", ".join(["one for all", "All for one"]))  # str_c with sep

x = ["Apple", "Banana", "Pear"]

# str_sub equivalent
print([s[:3] for s in x])  # First 3 characters
print([s[-3:] for s in x])  # Last 3 characters

print([s.upper() for s in x])  # str_to_upper
print([s.lower() for s in x])  # str_to_lower
print("Unstructured Data Analytics".title())  # str_to_title

# Regular expression patterns
fruit = ["apple", "banana", "pear", "orange", "grape"]
print([f for f in fruit if re.search("an", f)])  # str_view equivalent
print([f for f in fruit if re.search("^a", f)])  # starts with a
print([f for f in fruit if re.search("a$", f)])  # ends with a
print([f for f in fruit if re.search("^...$", f)])  # exactly 3 characters

ee = ["sum", "summarize", "rowsum", "summary"]
print([s for s in ee if re.search(r"\bsum", s)])  # starts with sum
print([s for s in ee if re.search(r"sum\b", s)])  # ends with sum

ss_num = ["This is a class with students", "There are 18 students", "This class is from 11.00 am"]
print([s for s in ss_num if re.search(r"\d", s)])  # contains digits
print([s for s in ss_num if re.search(r"\s", s)])  # contains whitespace

# Character classes
print([f for f in fruit if re.search(r"[abc]", f)])  # contains a, b, or c
print([f for f in fruit if re.search(r"^[abc]", f)])  # starts with a, b, or c

# Repetition
ex = "aabbbccddddeeeee"
print(re.findall(r"aab?", ex))
print(re.findall(r"(a|d){2}", ex))
print(re.findall(r"de+", ex))
print(re.findall(r"\d+", "There are 123 apples and 45 bananas"))

# Grouping and backreferencing
print([f for f in fruit if re.search(r"(a).\1", f)])  # a followed by any char, then a
print([f for f in fruit if re.search(r"(a)(.)\1\2", f)])  # a, any char, a, same char

# Text cleaning and preprocessing
def clean_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(f"[{re.escape(punctuation)}]", "", text)
    # Remove numbers
    text = re.sub(r"\d+", "", text)
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(text)
    words = [w for w in words if w not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    words = [stemmer.stem(w) for w in words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]
    return " ".join(words)

# Example cleaning
sample_text = "This is a sample text with some numbers 123 and punctuation!"
print(clean_text(sample_text))

# Word frequency analysis
corpus = ["This is the first document.",
          "This document is the second document.",
          "And this is the third one.",
          "Is this the first document?"]

# Create word frequency
words = []
for doc in corpus:
    words.extend(word_tokenize(clean_text(doc)))
word_freq = Counter(words)

# Create DataFrame
wf = pd.DataFrame.from_dict(word_freq, orient='index', columns=['FREQ'])
wf.index.name = 'TERM'
wf = wf.reset_index()
print(wf.head())

# Plotting
plt.figure(figsize=(10, 5))
wf[wf['FREQ'] >= 2].plot.bar(x='TERM', y='FREQ')
plt.xticks(rotation=45)
plt.show()

# Wordcloud
wordcloud = WordCloud(width=800, height=400).generate_from_frequencies(word_freq)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# For more advanced wordclouds (like wordcloud2 in R), you might need to use libraries like:
# from wordcloud import WordCloud, ImageColorGenerator
# or other visualization libraries like plotly

ModuleNotFoundError: No module named 'wordcloud'

In [12]:
import re
from string import punctuation
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Text processing functions
def clean_text(text):
    text = text.lower()
    text = re.sub(f"[{re.escape(punctuation)}]", "", text)
    text = re.sub(r"\d+", "", text)
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(text)
    words = [w for w in words if w not in stop_words]
    stemmer = PorterStemmer()
    words = [stemmer.stem(w) for w in words]
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]
    return " ".join(words)

# Sample corpus
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# Process and count words
words = []
for doc in corpus:
    words.extend(word_tokenize(clean_text(doc)))
word_freq = Counter(words)

# Create DataFrame
wf = pd.DataFrame.from_dict(word_freq, orient='index', columns=['FREQ'])
wf.index.name = 'TERM'
wf = wf.reset_index()

# Plot word frequencies
plt.figure(figsize=(10, 5))
wf[wf['FREQ'] >= 2].plot.bar(x='TERM', y='FREQ')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Word Cloud with error handling
try:
    from wordcloud import WordCloud
    # Generate word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)
    
    # Display the word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.tight_layout()
    plt.show()
    
except ImportError:
    print("WordCloud package not installed. Please install it using:")
    print("pip install wordcloud")
    print("Here's a bar chart instead:")
    plt.figure(figsize=(10, 5))
    wf.sort_values('FREQ', ascending=False).head(10).plot.bar(x='TERM', y='FREQ')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

ModuleNotFoundError: No module named 'nltk'

In [5]:
import pandas as pd

movie = pd.read_table('movies/Doc1.txt')
movie.head()
print('--------------------')

--------------------


In [1]:
import os
os.getcwd()

'c:\\Users\\user\\Documents\\Unstructured-Data-Analysis\\02. Text Data Mining'