## 1. np.arange (10 odd numbers)

In [1]:
import numpy as np
# Ten consecutive odd integers, 11 through 29: an offset of 11 added to
# the even multiples 0, 2, ..., 18.
odd10 = 11 + 2 * np.arange(10)
odd10


array([11, 13, 15, 17, 19, 21, 23, 25, 27, 29])

## 2. arr1 (4×5) and red-square slice

In [2]:
# Generator seeded with 1 so the same matrix is drawn on every fresh run.
rng = np.random.default_rng(seed=1)
# 4x5 matrix of integers; `high` is exclusive, so values lie in 1..29.
arr1 = rng.integers(low=1, high=30, size=(4, 5))
# Show the full matrix next to its inner block (rows 1-2, columns 1-3).
(arr1, arr1[1:3, 1:4])


(array([[14, 15, 22, 28,  2],
        [ 5, 24, 28,  8, 10],
        [26, 13,  8, 25,  8],
        [12, 19, 16,  3,  1]], dtype=int64),
 array([[24, 28,  8],
        [13,  8, 25]], dtype=int64))

## 3. arr2 (4×2) and horizontal stack with arr1

In [3]:
# Literal 4x2 matrix; it has the same number of rows as arr1, which is
# what horizontal stacking requires.
arr2 = np.array(
    [
        [27, 25],
        [8, 12],
        [11, 26],
        [25, 9],
    ]
)
# Append arr2's two columns to the right of arr1's five columns.
stacked = np.hstack((arr1, arr2))
(arr2, stacked)


(array([[27, 25],
        [ 8, 12],
        [11, 26],
        [25,  9]]),
 array([[14, 15, 22, 28,  2, 27, 25],
        [ 5, 24, 28,  8, 10,  8, 12],
        [26, 13,  8, 25,  8, 11, 26],
        [12, 19, 16,  3,  1, 25,  9]], dtype=int64))

## 4. Element-wise vs matrix multiplication; reshape to enable matmul

In [4]:
# Element-wise product: shapes (4,5) and (4,2) cannot be broadcast together,
# so NumPy raises ValueError. Only ValueError is caught on purpose: any other
# failure (for example a NameError because an earlier cell was not run) should
# surface instead of being recorded as the "expected" shape error.
elemwise_error = None
try:
    arr1 * arr2
except ValueError as e:
    elemwise_error = str(e)

# Matrix product: (4x5)@(4x2) is invalid because the inner dimensions
# (5 and 4) differ; this is also reported as a ValueError.
matmul_error = None
try:
    arr1 @ arr2
except ValueError as e:
    matmul_error = str(e)

# Reshape arr1 to (5x4) so (5x4)@(4x2)->(5x2) works.
# NOTE: reshape re-flows the 20 values in row-major order into 5 rows; it is
# not the transpose of arr1, so the rows of arr1_rs are not the columns of arr1.
arr1_rs = arr1.reshape(5,4)
good_mm = arr1_rs @ arr2
elemwise_error, matmul_error, arr1_rs.shape, good_mm.shape


('operands could not be broadcast together with shapes (4,5) (4,2) ',
 'matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 4 is different from 5)',
 (5, 4),
 (5, 2))

## 5. Data-cleaning pipeline (brief list)

In [5]:
# High-level checklist for the data-cleaning pipeline, in the order listed.
steps = list((
    # inspect and trim
    "Explore (shape/head/dtypes)",
    "Drop unused columns",
    # repair rows
    "Handle missing values",
    "Deduplicate by key (url)",
    "Validate ranges/outliers (rating in [0,5])",
    # normalize values
    "Standardize categories (institution/type)",
    "Type casting",
    # wrap up
    "Document steps",
))
steps


['Explore (shape/head/dtypes)',
 'Drop unused columns',
 'Handle missing values',
 'Deduplicate by key (url)',
 'Validate ranges/outliers (rating in [0,5])',
 'Standardize categories (institution/type)',
 'Type casting',
 'Document steps']

## 6. Overview: shape, head(10), dtypes

In [6]:
import pandas as pd
pd.set_option('display.max_columns', None)

# Load the raw course table.
# NOTE(review): this is an absolute, environment-specific path; confirm the
# file exists at this location wherever the notebook is run.
df = pd.read_csv('/mnt/data/webautomation_coursera.csv')

# Show each piece on its own. When a DataFrame is placed inside a tuple the
# notebook falls back to the tuple's plain-text repr, so the head would not be
# rendered as a table.
display(df.shape)
display(df.head(10))
df.dtypes


<class 'ModuleNotFoundError'>: No module named 'pandas'

## 7. Drop unused columns

In [None]:
# Columns this analysis treats as unused. errors='ignore' drops only the
# labels that are actually present, so a missing column is not an error.
to_drop = ['image', 'description', 'prerequisites', 'syllabus']
df = df.drop(columns=to_drop, errors='ignore')
df.shape


## 8. Missing values (count and %)

In [None]:
# Per-column number of missing cells.
missing_count = df.isna().sum()
# Same counts as a percentage of all rows, rounded to two decimals.
missing_pct = missing_count.div(len(df)).mul(100).round(2)
# One row per column, worst offenders first.
(
    pd.DataFrame({'missing_count': missing_count, 'missing_%': missing_pct})
    .sort_values(by='missing_count', ascending=False)
)


## 9. Fill language, drop missing rating, drop rows with >3 missing

In [None]:
# A missing language is imputed as 'English'.
if 'language' in df.columns:
    df['language'] = df['language'].fillna('English')

# Drop rows that have no rating and record how many rows that removed.
before = len(df)
if 'rating' in df.columns:
    df = df.dropna(subset=['rating'])
after_missing_rating = before - len(df)

# Keep only rows with at most 3 missing cells and record how many were removed.
before = len(df)
missing_per_row = df.isna().sum(axis=1)
df = df[missing_per_row <= 3]
after_row_filter = before - len(df)

# (rows dropped for missing rating, rows dropped by the row filter, final shape)
(after_missing_rating, after_row_filter, df.shape)


## 10. Deduplicate by url (keep last) and report removed

In [None]:
# Shape before deduplication, kept for the before/after report below.
before = df.shape
if 'url' in df.columns:
    # Rows sharing a url collapse to a single row; keep='last' retains the
    # final occurrence.
    df = df.drop_duplicates(subset='url', keep='last')
after = df.shape
# (shape before, shape after, number of rows removed)
(before, after, before[0] - after[0])


## 11. Remove rating outliers (keep 0–5)

In [None]:
# Remove rows whose rating falls outside the closed range [0, 5].
# `removed` defaults to 0 so the report below is valid even when the frame
# has no 'rating' column.
removed = 0
if 'rating' in df.columns:
    bad = (df['rating'] < 0) | (df['rating'] > 5)
    removed = int(bad.sum())
    # .copy() gives an independent frame, so later column assignments do not
    # operate on a slice of the previous one.
    df = df[~bad].copy()

# The report must be the cell's last top-level expression: a bare expression
# nested inside an if/else block is evaluated but never displayed.
removed, df.shape


## 12. Rename institution column and merge common variants; clean type

In [None]:
def _norm(s):
    """Return a normalized lookup key for a free-text label.

    Missing values (as judged by ``pd.isna``) are returned unchanged.
    Otherwise the value is converted to ``str``, stripped, lowercased,
    '&' is replaced by 'and', commas and periods are replaced by spaces,
    and runs of whitespace are collapsed to a single space.
    """
    if pd.isna(s):
        return s
    s = str(s).strip().lower().replace('&', 'and')
    for ch in [',', '.']:
        s = s.replace(ch, ' ')
    return ' '.join(s.split())

if 'associated-university-institution-company' in df.columns:
    df = df.rename(columns={'associated-university-institution-company':'institution'})

# Normalized variant -> canonical institution name.
canon = {
    'uwg': 'University of West Georgia',
    'university of west georgia': 'University of West Georgia',
    'ga tech': 'Georgia Tech',
    'ga tech college': 'Georgia Tech',
    'georgia institute of technology': 'Georgia Tech',
    'georgia tech': 'Georgia Tech',
}

# Default keeps `found` defined when there is no 'institution' column.
found = []
if 'institution' in df.columns:
    n = df['institution'].map(_norm)
    found = sorted({v for v in n.dropna().unique() if v in canon})
    # Positional pairing: a recognized variant gets its canonical name,
    # anything else (including missing values) keeps its original value.
    df['institution'] = [canon.get(x, orig) for x, orig in zip(n, df['institution'])]

if 'type' in df.columns:
    # Normalized level label -> display label; unknown labels are left as-is.
    tmap = {'beginner':'Beginner', 'intermediate':'Intermediate', 'advanced':'Advanced',
            'mixed':'Mixed', 'all levels':'All Levels'}
    tn = df['type'].map(_norm)
    df['type'] = [tmap.get(x, orig) for x, orig in zip(tn, df['type'])]

# Show which normalized variants were found and merged. This has to be the
# cell's last top-level expression: a bare `found` nested inside the `if`
# block above would be evaluated but never displayed.
found


## 13. Average rating by institution (Top 5)

In [None]:
# Mean rating per institution, highest first, limited to five rows.
# `top5` defaults to None so the final expression is valid (and shows
# nothing) when either required column is absent.
top5 = None
if {'institution','rating'}.issubset(df.columns):
    top5 = (df.groupby('institution', dropna=True)['rating']
              .mean()
              .sort_values(ascending=False)
              .round(3)
              .head(5)
              .reset_index(name='avg_rating'))

# Must be the cell's last top-level expression: a bare `top5` nested inside
# the `if` block is evaluated but never displayed.
top5


## 14. Pivot: number of courses by level for each institution; show top providers

In [7]:
if {'institution','type'}.issubset(df.columns):
    # Count rows per (institution, type) pair. crosstab counts the rows
    # themselves, so the result does not depend on picking a value column:
    # counting a column such as 'url' would skip rows where it is missing,
    # and falling back to the frame's first column could select the index or
    # column key itself.
    pivot = pd.crosstab(df['institution'], df['type'])
    display(pivot)
    # For each level of interest, report the institution(s) with the most
    # courses; ties are all listed.
    for lvl in ['Advanced','Intermediate','Beginner']:
        if lvl in pivot.columns:
            m = pivot[lvl].max()
            winners = pivot.index[pivot[lvl]==m].tolist()
            print(f"{lvl}: {winners} with {m} courses")


<class 'NameError'>: name 'df' is not defined